From 4c351465b3910948a5f2e27e69dfe8a59d3c25c4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:11:05 +0000
Subject: [PATCH 1/9] Initial plan
From e0477aba2ecb9f398e4e5ba2fba2af86774edd12 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:20:22 +0000
Subject: [PATCH 2/9] Add URL crawling functionality with comprehensive tests
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.integration.test.js | 200 +++++++++++++
src/crawler.js | 209 ++++++++++++++
src/crawler.test.js | 491 ++++++++++++++++++++++++++++++++
src/utils.js | 52 ++++
4 files changed, 952 insertions(+)
create mode 100644 src/crawler.integration.test.js
create mode 100644 src/crawler.js
create mode 100644 src/crawler.test.js
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
new file mode 100644
index 0000000..d6f8797
--- /dev/null
+++ b/src/crawler.integration.test.js
@@ -0,0 +1,200 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
+before(async function () {
+ const { expect } = await import("chai");
+ global.expect = expect;
+});
+
+describe("crawler integration", function () {
+ let qualifyFiles, axiosStub, fsStub, logStub, crawlUrlsStub, readFileStub;
+
+ beforeEach(function () {
+ axiosStub = {
+ get: sinon.stub(),
+ };
+
+ fsStub = {
+ statSync: sinon.stub(),
+ readdirSync: sinon.stub(),
+ existsSync: sinon.stub(),
+ mkdirSync: sinon.stub(),
+ writeFileSync: sinon.stub(),
+ };
+
+ logStub = sinon.stub();
+ crawlUrlsStub = sinon.stub();
+ readFileStub = sinon.stub().resolves({});
+
+ // Mock fetchFile behavior
+ axiosStub.get.callsFake(async (url) => {
+ if (url === "https://example.com/page1") {
+ return {
 data: '<a href="https://example.com/page2">Link</a>',
+ };
+ } else if (url === "https://example.com/page2") {
+ return { data: "Content" };
+ }
+ return { data: "" };
+ });
+
+ const utilsModule = proxyquire("./utils", {
+ axios: axiosStub,
+ fs: fsStub,
+ "./crawler": { crawlUrls: crawlUrlsStub },
+ "doc-detective-common": {
+ validate: () => ({ valid: true }),
+ resolvePaths: (x) => x,
+ transformToSchemaKey: (x) => x,
+ readFile: readFileStub,
+ },
+ });
+
+ qualifyFiles = utilsModule.qualifyFiles;
+ });
+
+ afterEach(function () {
+ sinon.restore();
+ });
+
+ it("should enable crawling by default for HTTP URLs", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.calledOnce).to.be.true;
+ expect(crawlUrlsStub.firstCall.args[0].initialUrls).to.deep.equal([
+ "https://example.com/page1",
+ ]);
+ });
+
+ it("should disable crawling when crawl is false", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ crawl: false,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.called).to.be.false;
+ });
+
+ it("should enable crawling when crawl is true", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ crawl: true,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.calledOnce).to.be.true;
+ });
+
+ it("should not crawl file:// URLs by default", async function () {
+ const config = {
+ input: [], // Empty input to avoid processing issues
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ // file:// URLs won't trigger crawling since they don't start with http:// or https://
+ // This test just verifies no crawling happens
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.called).to.be.false;
+ });
+
+ it("should pass origin config to crawler", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ origin: "https://example.com",
+ crawl: true,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves(["https://example.com/page1"]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ await qualifyFiles({ config });
+
+ expect(crawlUrlsStub.calledOnce).to.be.true;
+ expect(crawlUrlsStub.firstCall.args[0].config.origin).to.equal(
+ "https://example.com"
+ );
+ });
+
+ it("should log crawling activity", async function () {
+ const config = {
+ input: ["https://example.com/page1"],
+ crawl: true,
+ logLevel: "info",
+ fileTypes: [],
+ };
+
+ crawlUrlsStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+ // Capture console output
+ const originalConsoleLog = console.log;
+ const logOutput = [];
+ console.log = (...args) => {
+ logOutput.push(args.join(" "));
+ originalConsoleLog(...args);
+ };
+
+ try {
+ await qualifyFiles({ config });
+
+ // Check that crawling info was logged
+ const hasCrawlingLog = logOutput.some((msg) => msg.includes("Crawling"));
+ const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));
+
+ expect(hasCrawlingLog).to.be.true;
+ expect(hasDiscoveredLog).to.be.true;
+ } finally {
+ console.log = originalConsoleLog;
+ }
+ });
+});
diff --git a/src/crawler.js b/src/crawler.js
new file mode 100644
index 0000000..62c2ed5
--- /dev/null
+++ b/src/crawler.js
@@ -0,0 +1,209 @@
+const axios = require("axios");
+const { log } = require("./utils");
+
+exports.extractHtmlUrls = extractHtmlUrls;
+exports.extractMarkdownUrls = extractMarkdownUrls;
+exports.isSameOrigin = isSameOrigin;
+exports.resolveRelativeUrl = resolveRelativeUrl;
+exports.crawlUrls = crawlUrls;
+
+/**
+ * Extracts URLs from HTML tags with href attributes.
+ *
+ * @param {string} html - The HTML content to parse
+ * @returns {string[]} - Array of extracted URLs
+ */
+function extractHtmlUrls(html) {
+ if (typeof html !== "string") {
+ return [];
+ }
+
+ const urls = [];
+ // Match <a> tags with href attributes
+ // This regex handles various formats: href="url", href='url', href=url
+ const anchorRegex = /<a(?:[^>]*?\s+)?href=["']?([^"'\s>]+)["']?[^>]*>/gi;
+ let match;
+
+ while ((match = anchorRegex.exec(html)) !== null) {
+ const url = match[1];
+ if (url && url !== "#" && !url.startsWith("javascript:")) {
+ urls.push(url);
+ }
+ }
+
+ return urls;
+}
+
+/**
+ * Extracts URLs from Markdown [text](url) syntax.
+ *
+ * @param {string} markdown - The Markdown content to parse
+ * @returns {string[]} - Array of extracted URLs
+ */
+function extractMarkdownUrls(markdown) {
+ if (typeof markdown !== "string") {
+ return [];
+ }
+
+ const urls = [];
+ // Match [text](url) syntax, handling escaped brackets
+ // This regex avoids matching image syntax ![alt](url)
+ const linkRegex = /(?<!!)\[([^\]]*)\]\(([^)\s]+)(?:\s+"[^"]*")?\)/g;
+ let match;
+
+ while ((match = linkRegex.exec(markdown)) !== null) {
+ const url = match[2];
+ if (url) {
+ urls.push(url);
+ }
+ }
+
+ return urls;
+}
+
+/**
+ * Compares two URLs for strict origin matching.
+ *
+ * @param {string} url1 - First URL to compare
+ * @param {string} url2 - Second URL to compare
+ * @returns {boolean} - True if protocol, domain, and port all match
+ */
+function isSameOrigin(url1, url2) {
+ try {
+ const first = new URL(url1);
+ const second = new URL(url2);
+ return first.origin === second.origin;
+ } catch {
+ return false;
+ }
+}
+
+/**
+ * Resolves a relative URL against a base origin.
+ *
+ * @param {string} relativeUrl - The relative URL to resolve
+ * @param {string} baseOrigin - The base origin to resolve against
+ * @returns {string|null} - The resolved absolute URL, or null if resolution fails
+ */
+function resolveRelativeUrl(relativeUrl, baseOrigin) {
+ try {
+ return new URL(relativeUrl, baseOrigin).href;
+ } catch {
+ return null;
+ }
+}
+
+/**
+ * Crawls URLs starting from an initial set, discovering same-origin links.
+ *
+ * @param {Object} options - Crawling options
+ * @param {Object} options.config - Configuration object
+ * @param {string[]} options.initialUrls - Array of initial URLs to crawl
+ * @returns {Promise<string[]>} - Promise resolving to array of all discovered URLs
+ */
+async function crawlUrls({ config, initialUrls }) {
+ const visitedUrls = new Set();
+ const discoveredUrls = [];
+ const MAX_URLS = 10000;
+ let urlQueue = [...initialUrls];
+
+ // Process each URL in the queue
+ while (urlQueue.length > 0 && discoveredUrls.length < MAX_URLS) {
+ const currentUrl = urlQueue.shift();
+
+ // Skip if already visited
+ if (visitedUrls.has(currentUrl)) {
+ continue;
+ }
+
+ visitedUrls.add(currentUrl);
+ discoveredUrls.push(currentUrl);
+
+ log(config, "debug", `Crawling: ${currentUrl}`);
+
+ // Fetch the URL content
+ let content;
+ try {
+ const response = await axios.get(currentUrl, {
+ timeout: 30000,
+ maxRedirects: 5,
+ });
+ content = response.data;
+ } catch (error) {
+ log(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
+ continue;
+ }
+
+ // Extract URLs based on content type
+ let extractedUrls = [];
+ if (typeof content === "string") {
+ // Try both HTML and Markdown extraction
+ extractedUrls = [
+ ...extractHtmlUrls(content),
+ ...extractMarkdownUrls(content),
+ ];
+ }
+
+ // Process extracted URLs
+ for (const url of extractedUrls) {
+ let absoluteUrl;
+
+ // Check if URL is relative
+ try {
+ new URL(url);
+ absoluteUrl = url;
+ } catch {
+ // It's relative
+ if (config.origin) {
+ absoluteUrl = resolveRelativeUrl(url, config.origin);
+ if (!absoluteUrl) {
+ continue; // Skip malformed URLs
+ }
+ } else {
+ // No origin configured, skip relative URLs
+ log(
+ config,
+ "debug",
+ `Skipping relative URL (no origin configured): ${url}`
+ );
+ continue;
+ }
+ }
+
+ // Check if same origin as current URL
+ if (isSameOrigin(absoluteUrl, currentUrl)) {
+ if (!visitedUrls.has(absoluteUrl)) {
+ urlQueue.push(absoluteUrl);
+ }
+ }
+ }
+ }
+
+ // Log warning if limit reached
+ if (discoveredUrls.length >= MAX_URLS) {
+ log(
+ config,
+ "warn",
+ `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs`
+ );
+ }
+
+ return discoveredUrls;
+}
diff --git a/src/crawler.test.js b/src/crawler.test.js
new file mode 100644
index 0000000..0b17ca5
--- /dev/null
+++ b/src/crawler.test.js
@@ -0,0 +1,491 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
+before(async function () {
+ const { expect } = await import("chai");
+ global.expect = expect;
+});
+
+describe("crawler", function () {
+ describe("extractHtmlUrls", function () {
+ let extractHtmlUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractHtmlUrls = crawler.extractHtmlUrls;
+ });
+
+ it("should extract single URL from HTML", function () {
+ const html = '<a href="https://example.com/page1">Link</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from HTML", function () {
+ const html = `
+ <a href="https://example.com/page1">Link 1</a>
+ <a href="https://example.com/page2">Link 2</a>
+ <a href="https://example.com/page3">Link 3</a>
+ `;
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should handle single and double quotes", function () {
+ const html = `
+ <a href="https://example.com/page1">Link 1</a>
+ <a href='https://example.com/page2'>Link 2</a>
+ `;
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ });
+
+ it("should ignore anchor links", function () {
+ const html = '<a href="#">Anchor</a><a href="https://example.com">Link</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
+ it("should ignore javascript: links", function () {
+ const html = '<a href="javascript:void(0)">JS Link</a><a href="https://example.com">Link</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractHtmlUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractHtmlUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should extract relative URLs", function () {
+ const html = '<a href="/page1">Relative</a><a href="https://example.com">Absolute</a>';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["/page1", "https://example.com"]);
+ });
+ });
+
+ describe("extractMarkdownUrls", function () {
+ let extractMarkdownUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractMarkdownUrls = crawler.extractMarkdownUrls;
+ });
+
+ it("should extract single URL from Markdown", function () {
+ const markdown = "[Link](https://example.com/page1)";
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from Markdown", function () {
+ const markdown = `
+ [Link 1](https://example.com/page1)
+ [Link 2](https://example.com/page2)
+ [Link 3](https://example.com/page3)
+ `;
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should ignore image syntax", function () {
+ const markdown = " [Link](https://example.com/page1)";
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should handle URLs with title text", function () {
+ const markdown = '[Link](https://example.com/page1 "Title text")';
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractMarkdownUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractMarkdownUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should extract relative URLs", function () {
+ const markdown = "[Relative](/page1) [Absolute](https://example.com)";
+ const urls = extractMarkdownUrls(markdown);
+ expect(urls).to.deep.equal(["/page1", "https://example.com"]);
+ });
+ });
+
+ describe("isSameOrigin", function () {
+ let isSameOrigin;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ isSameOrigin = crawler.isSameOrigin;
+ });
+
+ it("should return true for same protocol, domain, and port", function () {
+ const result = isSameOrigin(
+ "https://example.com:443/page1",
+ "https://example.com:443/page2"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should return true for same origin with default ports", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://example.com/page2"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should return false for different protocol", function () {
+ const result = isSameOrigin(
+ "http://example.com/page1",
+ "https://example.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for different domain", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://other.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for different port", function () {
+ const result = isSameOrigin(
+ "https://example.com:443/page1",
+ "https://example.com:8080/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for subdomain differences", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://subdomain.example.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for malformed URLs", function () {
+ const result = isSameOrigin("not a url", "https://example.com");
+ expect(result).to.be.false;
+ });
+
+ it("should handle query parameters", function () {
+ const result = isSameOrigin(
+ "https://example.com/page?foo=bar",
+ "https://example.com/page?baz=qux"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should handle fragments", function () {
+ const result = isSameOrigin(
+ "https://example.com/page#section1",
+ "https://example.com/page#section2"
+ );
+ expect(result).to.be.true;
+ });
+ });
+
+ describe("resolveRelativeUrl", function () {
+ let resolveRelativeUrl;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ resolveRelativeUrl = crawler.resolveRelativeUrl;
+ });
+
+ it("should resolve relative path against origin", function () {
+ const result = resolveRelativeUrl(
+ "/page1",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://example.com/page1");
+ });
+
+ it("should resolve relative path with ../ navigation", function () {
+ const result = resolveRelativeUrl(
+ "../page1",
+ "https://example.com/dir/subdir/"
+ );
+ expect(result).to.equal("https://example.com/dir/page1");
+ });
+
+ it("should resolve absolute path starting with /", function () {
+ const result = resolveRelativeUrl(
+ "/absolute/path",
+ "https://example.com/some/dir"
+ );
+ expect(result).to.equal("https://example.com/absolute/path");
+ });
+
+ it("should return null for malformed relative URLs", function () {
+ // Note: URL constructor is quite forgiving, so we need a truly malformed URL
+ // In practice, most strings can be parsed as relative URLs
+ const result = resolveRelativeUrl(
+ "",
+ "not a valid base"
+ );
+ expect(result).to.be.null;
+ });
+
+ it("should return absolute URL unchanged", function () {
+ const result = resolveRelativeUrl(
+ "https://other.com/page",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://other.com/page");
+ });
+
+ it("should handle query parameters in relative URLs", function () {
+ const result = resolveRelativeUrl(
+ "/page?foo=bar",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://example.com/page?foo=bar");
+ });
+
+ it("should handle fragments in relative URLs", function () {
+ const result = resolveRelativeUrl(
+ "/page#section",
+ "https://example.com"
+ );
+ expect(result).to.equal("https://example.com/page#section");
+ });
+ });
+
+ describe("crawlUrls", function () {
+ let crawlUrls, axiosStub, logStub;
+
+ beforeEach(function () {
+ axiosStub = {
+ get: sinon.stub(),
+ };
+ logStub = sinon.stub();
+
+ const crawlerModule = proxyquire("./crawler", {
+ axios: axiosStub,
+ "./utils": { log: logStub },
+ });
+ crawlUrls = crawlerModule.crawlUrls;
+ });
+
+ afterEach(function () {
+ sinon.restore();
+ });
+
+ it("should crawl single URL with no links", async function () {
+ const config = { logLevel: "info" };
+ axiosStub.get.resolves({ data: "<html><body>No links</body></html>" });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ expect(axiosStub.get.calledOnce).to.be.true;
+ });
+
+ it("should crawl same-origin links", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="https://example.com/page2">Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
+ data: "No more links",
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(axiosStub.get.calledTwice).to.be.true;
+ });
+
+ it("should not crawl cross-origin links", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get.resolves({
 data: '<a href="https://other.com/page">External</a>',
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ expect(axiosStub.get.calledOnce).to.be.true;
+ });
+
+ it("should deduplicate URLs", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="https://example.com/page2">Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
 data: '<a href="https://example.com/page1">Back</a>',
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(axiosStub.get.calledTwice).to.be.true;
+ });
+
+ it("should handle fetch errors gracefully", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="https://example.com/page2">Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .rejects(new Error("404 Not Found"));
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(logStub.calledWith(config, "warn")).to.be.true;
+ });
+
+ it("should resolve relative URLs with origin config", async function () {
+ const config = { logLevel: "info", origin: "https://example.com" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
 data: '<a href="/page2">Relative Link</a>',
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
+ data: "No more links",
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ });
+
+ it("should skip relative URLs without origin config", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get.resolves({
 data: '<a href="/page2">Relative Link</a>',
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ expect(logStub.calledWith(config, "debug", sinon.match(/Skipping relative URL/))).to.be.true;
+ });
+
+ it("should extract URLs from Markdown content", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get
+ .withArgs("https://example.com/page1")
+ .resolves({
+ data: "[Link](https://example.com/page2)",
+ });
+
+ axiosStub.get
+ .withArgs("https://example.com/page2")
+ .resolves({
+ data: "No more links",
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ });
+
+ it("should handle non-string content", async function () {
+ const config = { logLevel: "info" };
+
+ axiosStub.get.resolves({ data: { json: "object" } });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page1"],
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+ });
+});
diff --git a/src/utils.js b/src/utils.js
index 8b0a61c..fb6ee29 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -11,6 +11,7 @@ const {
transformToSchemaKey,
readFile,
} = require("doc-detective-common");
+const { crawlUrls } = require("./crawler");
exports.qualifyFiles = qualifyFiles;
exports.parseTests = parseTests;
@@ -184,6 +185,57 @@ async function qualifyFiles({ config }) {
const cleanup = config.afterAll;
if (cleanup) sequence = sequence.concat(cleanup);
+ // Determine if crawling is enabled
+ let shouldCrawl = false;
+ if (config.crawl !== undefined) {
+ // Explicit config setting takes precedence
+ shouldCrawl = config.crawl === true;
+ }
+
+ // Collect URLs that should be crawled
+ const urlsToCrawl = [];
+ for (const source of sequence) {
+ const isHttpUrl =
+ typeof source === "string" &&
+ (source.startsWith("http://") || source.startsWith("https://"));
+
+ if (isHttpUrl) {
+ // Determine if this specific URL should be crawled
+ let crawlThisUrl = shouldCrawl;
+
+ // If crawl config is not explicitly set, use protocol-based default
+ if (config.crawl === undefined) {
+ crawlThisUrl = true; // HTTPS/HTTP URLs crawled by default
+ }
+
+ if (crawlThisUrl) {
+ urlsToCrawl.push(source);
+ }
+ }
+ }
+
+ // Perform crawling if there are URLs to crawl
+ if (urlsToCrawl.length > 0) {
+ log(config, "info", `Crawling ${urlsToCrawl.length} URL(s)...`);
+ try {
+ const crawledUrls = await crawlUrls({
+ config,
+ initialUrls: urlsToCrawl,
+ });
+
+ // Add newly discovered URLs to the sequence
+ // Filter out URLs that were already in the initial sequence
+ const newUrls = crawledUrls.filter((url) => !sequence.includes(url));
+ log(config, "info", `Discovered ${newUrls.length} additional URL(s) via crawling`);
+
+ // Add new URLs after the input section but before cleanup
+ const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length;
+ sequence.splice(cleanupStartIndex, 0, ...newUrls);
+ } catch (error) {
+ log(config, "error", `Crawling failed: ${error.message}`);
+ }
+ }
+
for (let source of sequence) {
log(config, "debug", `source: ${source}`);
// Check if source is a URL
From c529cbf848bb48f25bcf9a47c1d648183f68369c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:23:37 +0000
Subject: [PATCH 3/9] Add XML sitemap support, URL limit test, and config
validation handling
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/config.js | 11 ++++++
src/config.test.js | 40 ++++++++++++++++++++++
src/crawler.js | 30 +++++++++++++++-
src/crawler.test.js | 83 +++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 163 insertions(+), 1 deletion(-)
diff --git a/src/config.js b/src/config.js
index 72eb211..2ca814f 100644
--- a/src/config.js
+++ b/src/config.js
@@ -478,6 +478,12 @@ async function setConfig({ config }) {
}
}
+ // Extract non-schema fields before validation
+ const nonSchemaFields = {
+ crawl: config.crawl,
+ };
+ delete config.crawl;
+
// Validate inbound `config`.
const validityCheck = validate({ schemaKey: "config_v3", object: config });
if (!validityCheck.valid) {
@@ -491,6 +497,11 @@ async function setConfig({ config }) {
}
config = validityCheck.object;
+ // Restore non-schema fields after validation
+ if (nonSchemaFields.crawl !== undefined) {
+ config.crawl = nonSchemaFields.crawl;
+ }
+
// Replace fileType strings with objects
config.fileTypes = config.fileTypes.map((fileType) => {
if (typeof fileType === "object") return fileType;
diff --git a/src/config.test.js b/src/config.test.js
index 3880c5b..610df4b 100644
--- a/src/config.test.js
+++ b/src/config.test.js
@@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () {
expect(result.concurrentRunners).to.equal(4);
});
});
+
+describe("crawl config field", function () {
+ it("should preserve crawl field through validation", async function () {
+ const inputConfig = {
+ input: ["https://example.com"],
+ crawl: true,
+ logLevel: "info",
+ fileTypes: ["markdown"]
+ };
+
+ const result = await setConfig({ config: inputConfig });
+
+ expect(result.crawl).to.equal(true);
+ });
+
+ it("should handle crawl field set to false", async function () {
+ const inputConfig = {
+ input: ["https://example.com"],
+ crawl: false,
+ logLevel: "info",
+ fileTypes: ["markdown"]
+ };
+
+ const result = await setConfig({ config: inputConfig });
+
+ expect(result.crawl).to.equal(false);
+ });
+
+ it("should handle missing crawl field", async function () {
+ const inputConfig = {
+ input: ["https://example.com"],
+ logLevel: "info",
+ fileTypes: ["markdown"]
+ };
+
+ const result = await setConfig({ config: inputConfig });
+
+ expect(result.crawl).to.be.undefined;
+ });
+});
diff --git a/src/crawler.js b/src/crawler.js
index 62c2ed5..4b442dc 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -3,6 +3,7 @@ const { log } = require("./utils");
exports.extractHtmlUrls = extractHtmlUrls;
exports.extractMarkdownUrls = extractMarkdownUrls;
+exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
exports.isSameOrigin = isSameOrigin;
exports.resolveRelativeUrl = resolveRelativeUrl;
exports.crawlUrls = crawlUrls;
@@ -63,6 +64,32 @@ function extractMarkdownUrls(markdown) {
return urls;
}
+/**
+ * Extracts URLs from XML sitemap.
+ *
+ * @param {string} xml - The XML sitemap content to parse
+ * @returns {string[]} - Array of extracted URLs
+ */
+function extractXmlSitemapUrls(xml) {
+ if (typeof xml !== "string") {
+ return [];
+ }
+
+ const urls = [];
+ // Match <loc> tags in XML sitemaps
+ const locRegex = /<loc>([^<]+)<\/loc>/gi;
+ let match;
+
+ while ((match = locRegex.exec(xml)) !== null) {
+ const url = match[1].trim();
+ if (url) {
+ urls.push(url);
+ }
+ }
+
+ return urls;
+}
+
/**
* Compares two URLs for strict origin matching.
*
@@ -154,10 +181,11 @@ async function crawlUrls({ config, initialUrls }) {
// Extract URLs based on content type
let extractedUrls = [];
if (typeof content === "string") {
- // Try both HTML and Markdown extraction
+ // Try HTML, Markdown, and XML sitemap extraction
extractedUrls = [
...extractHtmlUrls(content),
...extractMarkdownUrls(content),
+ ...extractXmlSitemapUrls(content),
];
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 0b17ca5..74432d5 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -134,6 +134,63 @@ describe("crawler", function () {
});
});
+ describe("extractXmlSitemapUrls", function () {
+ let extractXmlSitemapUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractXmlSitemapUrls = crawler.extractXmlSitemapUrls;
+ });
+
+ it("should extract single URL from XML sitemap", function () {
+ const xml = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url>
+ <loc>https://example.com/page1</loc>
+ </url>
+ </urlset>`;
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from XML sitemap", function () {
+ const xml = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url>
+ <loc>https://example.com/page1</loc>
+ </url>
+ <url>
+ <loc>https://example.com/page2</loc>
+ </url>
+ <url>
+ <loc>https://example.com/page3</loc>
+ </url>
+ </urlset>`;
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractXmlSitemapUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractXmlSitemapUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle XML without loc tags", function () {
+ const xml = "<urlset><url>- test</url></urlset>";
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal([]);
+ });
+ });
+
describe("isSameOrigin", function () {
let isSameOrigin;
@@ -487,5 +544,31 @@ describe("crawler", function () {
expect(urls).to.deep.equal(["https://example.com/page1"]);
});
+
+ it("should enforce 10,000 URL limit", async function () {
+ const config = { logLevel: "info" };
+
+ // Create a mock that generates many URLs
+ let callCount = 0;
+ axiosStub.get.callsFake(async (url) => {
+ callCount++;
+ if (callCount <= 10001) {
+ // Generate unique URLs
+ return {
 data: `<a href="https://example.com/page${callCount}">Link</a>`,
+ };
+ }
+ return { data: "No more links" };
+ });
+
+ const urls = await crawlUrls({
+ config,
+ initialUrls: ["https://example.com/page0"],
+ });
+
+ // Should stop at 10,000 URLs
+ expect(urls.length).to.equal(10000);
+ expect(logStub.calledWith(config, "warn", sinon.match(/maximum limit/))).to.be.true;
+ });
});
});
From 7a6ec961befd4e8180e926989fc0ac6177755878 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:27:25 +0000
Subject: [PATCH 4/9] Fix circular dependency by removing log import from
crawler
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.js | 15 +++++++++------
src/crawler.test.js | 11 ++++++++++-
src/utils.js | 1 +
3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/src/crawler.js b/src/crawler.js
index 4b442dc..0a5f5f7 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -1,5 +1,4 @@
const axios = require("axios");
-const { log } = require("./utils");
exports.extractHtmlUrls = extractHtmlUrls;
exports.extractMarkdownUrls = extractMarkdownUrls;
@@ -143,9 +142,13 @@ function resolveRelativeUrl(relativeUrl, baseOrigin) {
* @param {Object} options - Crawling options
* @param {Object} options.config - Configuration object
* @param {string[]} options.initialUrls - Array of initial URLs to crawl
+ * @param {Function} options.log - Logging function (optional)
* @returns {Promise} - Promise resolving to array of all discovered URLs
*/
-async function crawlUrls({ config, initialUrls }) {
+async function crawlUrls({ config, initialUrls, log }) {
+ // Default no-op logger if not provided
+ const logger = log || (() => {});
+
const visitedUrls = new Set();
const discoveredUrls = [];
const MAX_URLS = 10000;
@@ -163,7 +166,7 @@ async function crawlUrls({ config, initialUrls }) {
visitedUrls.add(currentUrl);
discoveredUrls.push(currentUrl);
- log(config, "debug", `Crawling: ${currentUrl}`);
+ logger(config, "debug", `Crawling: ${currentUrl}`);
// Fetch the URL content
let content;
@@ -174,7 +177,7 @@ async function crawlUrls({ config, initialUrls }) {
});
content = response.data;
} catch (error) {
- log(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
+ logger(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
continue;
}
@@ -206,7 +209,7 @@ async function crawlUrls({ config, initialUrls }) {
}
} else {
// No origin configured, skip relative URLs
- log(
+ logger(
config,
"debug",
`Skipping relative URL (no origin configured): ${url}`
@@ -226,7 +229,7 @@ async function crawlUrls({ config, initialUrls }) {
// Log warning if limit reached
if (discoveredUrls.length >= MAX_URLS) {
- log(
+ logger(
config,
"warn",
`Crawling stopped: reached maximum limit of ${MAX_URLS} URLs`
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 74432d5..6674f9c 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -347,7 +347,6 @@ describe("crawler", function () {
const crawlerModule = proxyquire("./crawler", {
axios: axiosStub,
- "./utils": { log: logStub },
});
crawlUrls = crawlerModule.crawlUrls;
});
@@ -363,6 +362,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -387,6 +387,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -406,6 +407,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -430,6 +432,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -455,6 +458,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -482,6 +486,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -500,6 +505,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -524,6 +530,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal([
@@ -540,6 +547,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page1"],
+ log: logStub,
});
expect(urls).to.deep.equal(["https://example.com/page1"]);
@@ -564,6 +572,7 @@ describe("crawler", function () {
const urls = await crawlUrls({
config,
initialUrls: ["https://example.com/page0"],
+ log: logStub,
});
// Should stop at 10,000 URLs
diff --git a/src/utils.js b/src/utils.js
index fb6ee29..36912b7 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -221,6 +221,7 @@ async function qualifyFiles({ config }) {
const crawledUrls = await crawlUrls({
config,
initialUrls: urlsToCrawl,
+ log,
});
// Add newly discovered URLs to the sequence
From e4cb209a23fe82cfd215b957e4d6a3f4b51cf203 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 30 Oct 2025 13:30:42 +0000
Subject: [PATCH 5/9] Add security filtering for unsafe URL schemes (vbscript,
data)
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.js | 10 +++++++++-
src/crawler.test.js | 12 ++++++++++++
2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/src/crawler.js b/src/crawler.js
index 0a5f5f7..77b78cf 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -26,7 +26,15 @@ function extractHtmlUrls(html) {
while ((match = anchorRegex.exec(html)) !== null) {
const url = match[1];
- if (url && url !== "#" && !url.startsWith("javascript:")) {
+ // Filter out unsafe URL schemes
+ const lowerUrl = url.toLowerCase();
+ if (
+ url &&
+ url !== "#" &&
+ !lowerUrl.startsWith("javascript:") &&
+ !lowerUrl.startsWith("vbscript:") &&
+ !lowerUrl.startsWith("data:")
+ ) {
urls.push(url);
}
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 6674f9c..63e51da 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -60,6 +60,18 @@ describe("crawler", function () {
expect(urls).to.deep.equal(["https://example.com"]);
});
+ it("should ignore vbscript: links", function () {
+ const html = 'VB LinkLink';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
+ it("should ignore data: links", function () {
+ const html = 'Data LinkLink';
+ const urls = extractHtmlUrls(html);
+ expect(urls).to.deep.equal(["https://example.com"]);
+ });
+
it("should handle empty string", function () {
const urls = extractHtmlUrls("");
expect(urls).to.deep.equal([]);
From 6c5378f6472c129f52478abb0893f967e2c6273f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 31 Oct 2025 00:16:51 +0000
Subject: [PATCH 6/9] Reduce scope to only support sitemap.xml URLs
- Remove HTML and Markdown URL extraction
- Only process URLs ending with sitemap.xml
- Simplify crawler to crawlSitemap function
- Update tests to reflect reduced scope
- 66 tests passing (down from 96)
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.integration.test.js | 95 ++++---
src/crawler.js | 200 +++------------
src/crawler.test.js | 421 +++-----------------------------
src/utils.js | 42 ++--
4 files changed, 128 insertions(+), 630 deletions(-)
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
index d6f8797..7d4ab9e 100644
--- a/src/crawler.integration.test.js
+++ b/src/crawler.integration.test.js
@@ -8,7 +8,7 @@ before(async function () {
});
describe("crawler integration", function () {
- let qualifyFiles, axiosStub, fsStub, logStub, crawlUrlsStub, readFileStub;
+ let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub;
beforeEach(function () {
axiosStub = {
@@ -23,18 +23,19 @@ describe("crawler integration", function () {
writeFileSync: sinon.stub(),
};
- logStub = sinon.stub();
- crawlUrlsStub = sinon.stub();
+ crawlSitemapStub = sinon.stub();
readFileStub = sinon.stub().resolves({});
// Mock fetchFile behavior
axiosStub.get.callsFake(async (url) => {
- if (url === "https://example.com/page1") {
+ if (url.endsWith("sitemap.xml")) {
return {
- data: 'Link',
+ data: `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url><loc>https://example.com/page1</loc></url>
+ <url><loc>https://example.com/page2</loc></url>
+ </urlset>`,
};
- } else if (url === "https://example.com/page2") {
- return { data: "Content" };
}
return { data: "" };
});
@@ -42,7 +43,7 @@ describe("crawler integration", function () {
const utilsModule = proxyquire("./utils", {
axios: axiosStub,
fs: fsStub,
- "./crawler": { crawlUrls: crawlUrlsStub },
+ "./crawler": { crawlSitemap: crawlSitemapStub },
"doc-detective-common": {
validate: () => ({ valid: true }),
resolvePaths: (x) => x,
@@ -58,14 +59,14 @@ describe("crawler integration", function () {
sinon.restore();
});
- it("should enable crawling by default for HTTP URLs", async function () {
+ it("should process sitemap.xml URLs by default", async function () {
const config = {
- input: ["https://example.com/page1"],
+ input: ["https://example.com/sitemap.xml"],
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves([
+ crawlSitemapStub.resolves([
"https://example.com/page1",
"https://example.com/page2",
]);
@@ -76,16 +77,13 @@ describe("crawler integration", function () {
await qualifyFiles({ config });
- expect(crawlUrlsStub.calledOnce).to.be.true;
- expect(crawlUrlsStub.firstCall.args[0].initialUrls).to.deep.equal([
- "https://example.com/page1",
- ]);
+ expect(crawlSitemapStub.calledOnce).to.be.true;
+ expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml");
});
- it("should disable crawling when crawl is false", async function () {
+ it("should not process non-sitemap URLs", async function () {
const config = {
- input: ["https://example.com/page1"],
- crawl: false,
+ input: ["https://example.com/page.html"],
logLevel: "info",
fileTypes: [],
};
@@ -96,78 +94,69 @@ describe("crawler integration", function () {
await qualifyFiles({ config });
- expect(crawlUrlsStub.called).to.be.false;
+ expect(crawlSitemapStub.called).to.be.false;
});
- it("should enable crawling when crawl is true", async function () {
+ it("should disable processing when crawl is false", async function () {
const config = {
- input: ["https://example.com/page1"],
- crawl: true,
+ input: ["https://example.com/sitemap.xml"],
+ crawl: false,
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
-
// Mock file system calls for fetched files
fsStub.existsSync.returns(true);
fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
await qualifyFiles({ config });
- expect(crawlUrlsStub.calledOnce).to.be.true;
+ expect(crawlSitemapStub.called).to.be.false;
});
- it("should not crawl file:// URLs by default", async function () {
+ it("should enable processing when crawl is true", async function () {
const config = {
- input: [], // Empty input to avoid processing issues
+ input: ["https://example.com/sitemap.xml"],
+ crawl: true,
logLevel: "info",
fileTypes: [],
};
- // file:// URLs won't trigger crawling since they don't start with http:// or https://
- // This test just verifies no crawling happens
+ crawlSitemapStub.resolves([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+
+ // Mock file system calls for fetched files
+ fsStub.existsSync.returns(true);
+ fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
await qualifyFiles({ config });
- expect(crawlUrlsStub.called).to.be.false;
+ expect(crawlSitemapStub.calledOnce).to.be.true;
});
- it("should pass origin config to crawler", async function () {
+ it("should not process file:// URLs", async function () {
const config = {
- input: ["https://example.com/page1"],
- origin: "https://example.com",
- crawl: true,
+ input: [],
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves(["https://example.com/page1"]);
-
- // Mock file system calls for fetched files
- fsStub.existsSync.returns(true);
- fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
-
await qualifyFiles({ config });
- expect(crawlUrlsStub.calledOnce).to.be.true;
- expect(crawlUrlsStub.firstCall.args[0].config.origin).to.equal(
- "https://example.com"
- );
+ expect(crawlSitemapStub.called).to.be.false;
});
- it("should log crawling activity", async function () {
+ it("should log sitemap processing activity", async function () {
const config = {
- input: ["https://example.com/page1"],
+ input: ["https://example.com/sitemap.xml"],
crawl: true,
logLevel: "info",
fileTypes: [],
};
- crawlUrlsStub.resolves([
+ crawlSitemapStub.resolves([
"https://example.com/page1",
"https://example.com/page2",
]);
@@ -187,11 +176,11 @@ describe("crawler integration", function () {
try {
await qualifyFiles({ config });
- // Check that crawling info was logged
- const hasCrawlingLog = logOutput.some((msg) => msg.includes("Crawling"));
+ // Check that processing info was logged
+ const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap"));
const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));
- expect(hasCrawlingLog).to.be.true;
+ expect(hasProcessingLog).to.be.true;
expect(hasDiscoveredLog).to.be.true;
} finally {
console.log = originalConsoleLog;
diff --git a/src/crawler.js b/src/crawler.js
index 77b78cf..4549b77 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -1,75 +1,8 @@
const axios = require("axios");
-exports.extractHtmlUrls = extractHtmlUrls;
-exports.extractMarkdownUrls = extractMarkdownUrls;
exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
exports.isSameOrigin = isSameOrigin;
-exports.resolveRelativeUrl = resolveRelativeUrl;
-exports.crawlUrls = crawlUrls;
-
-/**
- * Extracts URLs from HTML tags with href attributes.
- *
- * @param {string} html - The HTML content to parse
- * @returns {string[]} - Array of extracted URLs
- */
-function extractHtmlUrls(html) {
- if (typeof html !== "string") {
- return [];
- }
-
- const urls = [];
- // Match <a> tags with href attributes
- // This regex handles various formats: href="url", href='url', href=url
- const anchorRegex = /<a(?:\s[^>]*?\s+)?href=["']?([^"'\s>]+)["']?[^>]*>/gi;
- let match;
-
- while ((match = anchorRegex.exec(html)) !== null) {
- const url = match[1];
- // Filter out unsafe URL schemes
- const lowerUrl = url.toLowerCase();
- if (
- url &&
- url !== "#" &&
- !lowerUrl.startsWith("javascript:") &&
- !lowerUrl.startsWith("vbscript:") &&
- !lowerUrl.startsWith("data:")
- ) {
- urls.push(url);
- }
- }
-
- return urls;
-}
-
-/**
- * Extracts URLs from Markdown [text](url) syntax.
- *
- * @param {string} markdown - The Markdown content to parse
- * @returns {string[]} - Array of extracted URLs
- */
-function extractMarkdownUrls(markdown) {
- if (typeof markdown !== "string") {
- return [];
- }
-
- const urls = [];
- // Match [text](url) syntax, handling escaped brackets
- // This regex avoids matching image syntax 
- const linkRegex = /(?} - Promise resolving to array of all discovered URLs
*/
-async function crawlUrls({ config, initialUrls, log }) {
+async function crawlSitemap({ config, sitemapUrl, log }) {
// Default no-op logger if not provided
const logger = log || (() => {});
- const visitedUrls = new Set();
const discoveredUrls = [];
- const MAX_URLS = 10000;
- let urlQueue = [...initialUrls];
- // Process each URL in the queue
- while (urlQueue.length > 0 && discoveredUrls.length < MAX_URLS) {
- const currentUrl = urlQueue.shift();
-
- // Skip if already visited
- if (visitedUrls.has(currentUrl)) {
- continue;
- }
-
- visitedUrls.add(currentUrl);
- discoveredUrls.push(currentUrl);
-
- logger(config, "debug", `Crawling: ${currentUrl}`);
-
- // Fetch the URL content
- let content;
- try {
- const response = await axios.get(currentUrl, {
- timeout: 30000,
- maxRedirects: 5,
- });
- content = response.data;
- } catch (error) {
- logger(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`);
- continue;
- }
-
- // Extract URLs based on content type
- let extractedUrls = [];
- if (typeof content === "string") {
- // Try HTML, Markdown, and XML sitemap extraction
- extractedUrls = [
- ...extractHtmlUrls(content),
- ...extractMarkdownUrls(content),
- ...extractXmlSitemapUrls(content),
- ];
- }
+ logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);
+
+ // Fetch the sitemap content
+ let content;
+ try {
+ const response = await axios.get(sitemapUrl, {
+ timeout: 30000,
+ maxRedirects: 5,
+ });
+ content = response.data;
+ } catch (error) {
+ logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
+ return discoveredUrls;
+ }
+
+ // Extract URLs from sitemap
+ if (typeof content === "string") {
+ const extractedUrls = extractXmlSitemapUrls(content);
- // Process extracted URLs
+ // Filter URLs to only include same-origin URLs
for (const url of extractedUrls) {
- let absoluteUrl;
-
- // Check if URL is relative
- try {
- new URL(url);
- absoluteUrl = url;
- } catch {
- // It's relative
- if (config.origin) {
- absoluteUrl = resolveRelativeUrl(url, config.origin);
- if (!absoluteUrl) {
- continue; // Skip malformed URLs
- }
- } else {
- // No origin configured, skip relative URLs
- logger(
- config,
- "debug",
- `Skipping relative URL (no origin configured): ${url}`
- );
- continue;
- }
- }
-
- // Check if same origin as current URL
- if (isSameOrigin(absoluteUrl, currentUrl)) {
- if (!visitedUrls.has(absoluteUrl)) {
- urlQueue.push(absoluteUrl);
- }
+ if (isSameOrigin(url, sitemapUrl)) {
+ discoveredUrls.push(url);
+ } else {
+ logger(config, "debug", `Skipping cross-origin URL: ${url}`);
}
}
}
- // Log warning if limit reached
- if (discoveredUrls.length >= MAX_URLS) {
- logger(
- config,
- "warn",
- `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs`
- );
- }
+ logger(config, "info", `Discovered ${discoveredUrls.length} URL(s) from sitemap`);
return discoveredUrls;
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
index 63e51da..54a07e8 100644
--- a/src/crawler.test.js
+++ b/src/crawler.test.js
@@ -8,144 +8,6 @@ before(async function () {
});
describe("crawler", function () {
- describe("extractHtmlUrls", function () {
- let extractHtmlUrls;
-
- beforeEach(function () {
- const crawler = require("./crawler");
- extractHtmlUrls = crawler.extractHtmlUrls;
- });
-
- it("should extract single URL from HTML", function () {
- const html = 'Link';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should extract multiple URLs from HTML", function () {
- const html = `
- Link 1
- Link 2
- Link 3
- `;
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- "https://example.com/page3",
- ]);
- });
-
- it("should handle single and double quotes", function () {
- const html = `
- Link 1
- Link 2
- `;
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- });
-
- it("should ignore anchor links", function () {
- const html = 'AnchorLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should ignore javascript: links", function () {
- const html = 'JS LinkLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should ignore vbscript: links", function () {
- const html = 'VB LinkLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should ignore data: links", function () {
- const html = 'Data LinkLink';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["https://example.com"]);
- });
-
- it("should handle empty string", function () {
- const urls = extractHtmlUrls("");
- expect(urls).to.deep.equal([]);
- });
-
- it("should handle non-string input", function () {
- const urls = extractHtmlUrls(null);
- expect(urls).to.deep.equal([]);
- });
-
- it("should extract relative URLs", function () {
- const html = 'RelativeAbsolute';
- const urls = extractHtmlUrls(html);
- expect(urls).to.deep.equal(["/page1", "https://example.com"]);
- });
- });
-
- describe("extractMarkdownUrls", function () {
- let extractMarkdownUrls;
-
- beforeEach(function () {
- const crawler = require("./crawler");
- extractMarkdownUrls = crawler.extractMarkdownUrls;
- });
-
- it("should extract single URL from Markdown", function () {
- const markdown = "[Link](https://example.com/page1)";
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should extract multiple URLs from Markdown", function () {
- const markdown = `
- [Link 1](https://example.com/page1)
- [Link 2](https://example.com/page2)
- [Link 3](https://example.com/page3)
- `;
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- "https://example.com/page3",
- ]);
- });
-
- it("should ignore image syntax", function () {
- const markdown = " [Link](https://example.com/page1)";
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should handle URLs with title text", function () {
- const markdown = '[Link](https://example.com/page1 "Title text")';
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should handle empty string", function () {
- const urls = extractMarkdownUrls("");
- expect(urls).to.deep.equal([]);
- });
-
- it("should handle non-string input", function () {
- const urls = extractMarkdownUrls(null);
- expect(urls).to.deep.equal([]);
- });
-
- it("should extract relative URLs", function () {
- const markdown = "[Relative](/page1) [Absolute](https://example.com)";
- const urls = extractMarkdownUrls(markdown);
- expect(urls).to.deep.equal(["/page1", "https://example.com"]);
- });
- });
-
describe("extractXmlSitemapUrls", function () {
let extractXmlSitemapUrls;
@@ -281,75 +143,8 @@ describe("crawler", function () {
});
});
- describe("resolveRelativeUrl", function () {
- let resolveRelativeUrl;
-
- beforeEach(function () {
- const crawler = require("./crawler");
- resolveRelativeUrl = crawler.resolveRelativeUrl;
- });
-
- it("should resolve relative path against origin", function () {
- const result = resolveRelativeUrl(
- "/page1",
- "https://example.com"
- );
- expect(result).to.equal("https://example.com/page1");
- });
-
- it("should resolve relative path with ../ navigation", function () {
- const result = resolveRelativeUrl(
- "../page1",
- "https://example.com/dir/subdir/"
- );
- expect(result).to.equal("https://example.com/dir/page1");
- });
-
- it("should resolve absolute path starting with /", function () {
- const result = resolveRelativeUrl(
- "/absolute/path",
- "https://example.com/some/dir"
- );
- expect(result).to.equal("https://example.com/absolute/path");
- });
-
- it("should return null for malformed relative URLs", function () {
- // Note: URL constructor is quite forgiving, so we need a truly malformed URL
- // In practice, most strings can be parsed as relative URLs
- const result = resolveRelativeUrl(
- "",
- "not a valid base"
- );
- expect(result).to.be.null;
- });
-
- it("should return absolute URL unchanged", function () {
- const result = resolveRelativeUrl(
- "https://other.com/page",
- "https://example.com"
- );
- expect(result).to.equal("https://other.com/page");
- });
-
- it("should handle query parameters in relative URLs", function () {
- const result = resolveRelativeUrl(
- "/page?foo=bar",
- "https://example.com"
- );
- expect(result).to.equal("https://example.com/page?foo=bar");
- });
-
- it("should handle fragments in relative URLs", function () {
- const result = resolveRelativeUrl(
- "/page#section",
- "https://example.com"
- );
- expect(result).to.equal("https://example.com/page#section");
- });
- });
-
- describe("crawlUrls", function () {
- let crawlUrls, axiosStub, logStub;
+ describe("crawlSitemap", function () {
+ let crawlSitemap, axiosStub, logStub;
beforeEach(function () {
axiosStub = {
@@ -360,45 +155,27 @@ describe("crawler", function () {
const crawlerModule = proxyquire("./crawler", {
axios: axiosStub,
});
- crawlUrls = crawlerModule.crawlUrls;
+ crawlSitemap = crawlerModule.crawlSitemap;
});
afterEach(function () {
sinon.restore();
});
- it("should crawl single URL with no links", async function () {
- const config = { logLevel: "info" };
- axiosStub.get.resolves({ data: "No links" });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- expect(axiosStub.get.calledOnce).to.be.true;
- });
-
- it("should crawl same-origin links", async function () {
+ it("should process sitemap and extract same-origin URLs", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+ const sitemapContent = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url><loc>https://example.com/page1</loc></url>
+ <url><loc>https://example.com/page2</loc></url>
+ </urlset>`;
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: "No more links",
- });
+ axiosStub.get.resolves({ data: sitemapContent });
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
@@ -406,190 +183,58 @@ describe("crawler", function () {
"https://example.com/page1",
"https://example.com/page2",
]);
- expect(axiosStub.get.calledTwice).to.be.true;
- });
-
- it("should not crawl cross-origin links", async function () {
- const config = { logLevel: "info" };
-
- axiosStub.get.resolves({
- data: 'External',
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal(["https://example.com/page1"]);
expect(axiosStub.get.calledOnce).to.be.true;
});
- it("should deduplicate URLs", async function () {
+ it("should filter out cross-origin URLs", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+ const sitemapContent = `<?xml version="1.0" encoding="UTF-8"?>
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+ <url><loc>https://example.com/page1</loc></url>
+ <url><loc>https://other.com/page2</loc></url>
+ </urlset>`;
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: 'Back',
- });
+ axiosStub.get.resolves({ data: sitemapContent });
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- expect(axiosStub.get.calledTwice).to.be.true;
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
});
it("should handle fetch errors gracefully", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .rejects(new Error("404 Not Found"));
+ axiosStub.get.rejects(new Error("404 Not Found"));
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
+ expect(urls).to.deep.equal([]);
expect(logStub.calledWith(config, "warn")).to.be.true;
});
- it("should resolve relative URLs with origin config", async function () {
- const config = { logLevel: "info", origin: "https://example.com" };
-
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: 'Relative Link',
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: "No more links",
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- });
-
- it("should skip relative URLs without origin config", async function () {
- const config = { logLevel: "info" };
-
- axiosStub.get.resolves({
- data: 'Relative Link',
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- expect(logStub.calledWith(config, "debug", sinon.match(/Skipping relative URL/))).to.be.true;
- });
-
- it("should extract URLs from Markdown content", async function () {
- const config = { logLevel: "info" };
-
- axiosStub.get
- .withArgs("https://example.com/page1")
- .resolves({
- data: "[Link](https://example.com/page2)",
- });
-
- axiosStub.get
- .withArgs("https://example.com/page2")
- .resolves({
- data: "No more links",
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page1"],
- log: logStub,
- });
-
- expect(urls).to.deep.equal([
- "https://example.com/page1",
- "https://example.com/page2",
- ]);
- });
-
it("should handle non-string content", async function () {
const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
axiosStub.get.resolves({ data: { json: "object" } });
- const urls = await crawlUrls({
+ const urls = await crawlSitemap({
config,
- initialUrls: ["https://example.com/page1"],
+ sitemapUrl,
log: logStub,
});
- expect(urls).to.deep.equal(["https://example.com/page1"]);
- });
-
- it("should enforce 10,000 URL limit", async function () {
- const config = { logLevel: "info" };
-
- // Create a mock that generates many URLs
- let callCount = 0;
- axiosStub.get.callsFake(async (url) => {
- callCount++;
- if (callCount <= 10001) {
- // Generate unique URLs
- return {
- data: `<a href="https://example.com/page${callCount}">Link</a>`,
- };
- }
- return { data: "No more links" };
- });
-
- const urls = await crawlUrls({
- config,
- initialUrls: ["https://example.com/page0"],
- log: logStub,
- });
-
- // Should stop at 10,000 URLs
- expect(urls.length).to.equal(10000);
- expect(logStub.calledWith(config, "warn", sinon.match(/maximum limit/))).to.be.true;
+ expect(urls).to.deep.equal([]);
});
});
});
diff --git a/src/utils.js b/src/utils.js
index 36912b7..ab4fa47 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -11,7 +11,7 @@ const {
transformToSchemaKey,
readFile,
} = require("doc-detective-common");
-const { crawlUrls } = require("./crawler");
+const { crawlSitemap } = require("./crawler");
exports.qualifyFiles = qualifyFiles;
exports.parseTests = parseTests;
@@ -192,48 +192,56 @@ async function qualifyFiles({ config }) {
shouldCrawl = config.crawl === true;
}
- // Collect URLs that should be crawled
- const urlsToCrawl = [];
+ // Collect sitemap.xml URLs that should be crawled
+ const sitemapsToProcess = [];
for (const source of sequence) {
const isHttpUrl =
typeof source === "string" &&
(source.startsWith("http://") || source.startsWith("https://"));
+
+ const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml");
- if (isHttpUrl) {
+ if (isHttpUrl && isSitemapUrl) {
// Determine if this specific URL should be crawled
let crawlThisUrl = shouldCrawl;
// If crawl config is not explicitly set, use protocol-based default
if (config.crawl === undefined) {
- crawlThisUrl = true; // HTTPS/HTTP URLs crawled by default
+ crawlThisUrl = true; // HTTPS/HTTP sitemap.xml URLs crawled by default
}
if (crawlThisUrl) {
- urlsToCrawl.push(source);
+ sitemapsToProcess.push(source);
}
}
}
- // Perform crawling if there are URLs to crawl
- if (urlsToCrawl.length > 0) {
- log(config, "info", `Crawling ${urlsToCrawl.length} URL(s)...`);
+ // Process sitemaps if there are any to crawl
+ if (sitemapsToProcess.length > 0) {
+ log(config, "info", `Processing ${sitemapsToProcess.length} sitemap(s)...`);
try {
- const crawledUrls = await crawlUrls({
- config,
- initialUrls: urlsToCrawl,
- log,
- });
+ const allDiscoveredUrls = [];
+
+ // Process each sitemap
+ for (const sitemapUrl of sitemapsToProcess) {
+ const discoveredUrls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log,
+ });
+ allDiscoveredUrls.push(...discoveredUrls);
+ }
// Add newly discovered URLs to the sequence
// Filter out URLs that were already in the initial sequence
- const newUrls = crawledUrls.filter((url) => !sequence.includes(url));
- log(config, "info", `Discovered ${newUrls.length} additional URL(s) via crawling`);
+ const newUrls = allDiscoveredUrls.filter((url) => !sequence.includes(url));
+ log(config, "info", `Discovered ${newUrls.length} additional URL(s) from sitemap(s)`);
// Add new URLs after the input section but before cleanup
const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length;
sequence.splice(cleanupStartIndex, 0, ...newUrls);
} catch (error) {
- log(config, "error", `Crawling failed: ${error.message}`);
+ log(config, "error", `Sitemap processing failed: ${error.message}`);
}
}
From 3f247a1dcb1676cdfe4005d017a903af03de77b3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 31 Oct 2025 00:38:07 +0000
Subject: [PATCH 7/9] Pin to doc-detective-common@3.4.0-dev.3 and remove crawl
field workaround
- Update package.json to use doc-detective-common@3.4.0-dev.3
- Remove code that strips/restores crawl field during validation (now in schema)
- Simplify utils.js logic since config.crawl always defined (defaults to false)
- Update tests to reflect new default behavior (crawl defaults to false)
- All 66 tests passing
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
package-lock.json | 8 ++++----
package.json | 2 +-
src/config.js | 11 -----------
src/config.test.js | 4 ++--
src/crawler.integration.test.js | 3 ++-
src/utils.js | 18 ++----------------
6 files changed, 11 insertions(+), 35 deletions(-)
diff --git a/package-lock.json b/package-lock.json
index 357fc7b..ee9c42c 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -12,7 +12,7 @@
"@apidevtools/json-schema-ref-parser": "^14.2.1",
"ajv": "^8.17.1",
"axios": "^1.13.1",
- "doc-detective-common": "3.4.0-dev.2",
+ "doc-detective-common": "3.4.0-dev.3",
"dotenv": "^17.2.3",
"json-schema-faker": "^0.5.9",
"posthog-node": "^5.10.4"
@@ -700,9 +700,9 @@
}
},
"node_modules/doc-detective-common": {
- "version": "3.4.0-dev.2",
- "resolved": "https://registry.npmjs.org/doc-detective-common/-/doc-detective-common-3.4.0-dev.2.tgz",
- "integrity": "sha512-COI9K9jBTwMJmlDLVUksF2WTCKnNlIJhr7pgUUR5VAX6Wt0t6DLfAkbHOUROVFWeBAPW1yVwuLRTQkmNxoR9Dw==",
+ "version": "3.4.0-dev.3",
+ "resolved": "https://registry.npmjs.org/doc-detective-common/-/doc-detective-common-3.4.0-dev.3.tgz",
+ "integrity": "sha512-ZZohuQ7qTxwjXkaehyEJA/5hM5oKQ2J1B6Ue6zTbRl0MJcKtH+iF9X5Vl9aqyEplwPmUabxtzO6m0H4PZKko0g==",
"license": "AGPL-3.0-only",
"dependencies": {
"@apidevtools/json-schema-ref-parser": "^14.2.1",
diff --git a/package.json b/package.json
index 0cf5f87..c5ccb0b 100644
--- a/package.json
+++ b/package.json
@@ -27,7 +27,7 @@
"@apidevtools/json-schema-ref-parser": "^14.2.1",
"ajv": "^8.17.1",
"axios": "^1.13.1",
- "doc-detective-common": "3.4.0-dev.2",
+ "doc-detective-common": "3.4.0-dev.3",
"dotenv": "^17.2.3",
"json-schema-faker": "^0.5.9",
"posthog-node": "^5.10.4"
diff --git a/src/config.js b/src/config.js
index 2ca814f..72eb211 100644
--- a/src/config.js
+++ b/src/config.js
@@ -478,12 +478,6 @@ async function setConfig({ config }) {
}
}
- // Extract non-schema fields before validation
- const nonSchemaFields = {
- crawl: config.crawl,
- };
- delete config.crawl;
-
// Validate inbound `config`.
const validityCheck = validate({ schemaKey: "config_v3", object: config });
if (!validityCheck.valid) {
@@ -497,11 +491,6 @@ async function setConfig({ config }) {
}
config = validityCheck.object;
- // Restore non-schema fields after validation
- if (nonSchemaFields.crawl !== undefined) {
- config.crawl = nonSchemaFields.crawl;
- }
-
// Replace fileType strings with objects
config.fileTypes = config.fileTypes.map((fileType) => {
if (typeof fileType === "object") return fileType;
diff --git a/src/config.test.js b/src/config.test.js
index 610df4b..dc8e283 100644
--- a/src/config.test.js
+++ b/src/config.test.js
@@ -530,7 +530,7 @@ describe("crawl config field", function () {
expect(result.crawl).to.equal(false);
});
- it("should handle missing crawl field", async function () {
+ it("should default crawl field to false when not specified", async function () {
const inputConfig = {
input: ["https://example.com"],
logLevel: "info",
@@ -539,6 +539,6 @@ describe("crawl config field", function () {
const result = await setConfig({ config: inputConfig });
- expect(result.crawl).to.be.undefined;
+ expect(result.crawl).to.equal(false);
});
});
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
index 7d4ab9e..ad885c3 100644
--- a/src/crawler.integration.test.js
+++ b/src/crawler.integration.test.js
@@ -59,9 +59,10 @@ describe("crawler integration", function () {
sinon.restore();
});
- it("should process sitemap.xml URLs by default", async function () {
+ it("should process sitemap.xml URLs when crawl is true", async function () {
const config = {
input: ["https://example.com/sitemap.xml"],
+ crawl: true,
logLevel: "info",
fileTypes: [],
};
diff --git a/src/utils.js b/src/utils.js
index ab4fa47..9253cdf 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -185,13 +185,6 @@ async function qualifyFiles({ config }) {
const cleanup = config.afterAll;
if (cleanup) sequence = sequence.concat(cleanup);
- // Determine if crawling is enabled
- let shouldCrawl = false;
- if (config.crawl !== undefined) {
- // Explicit config setting takes precedence
- shouldCrawl = config.crawl === true;
- }
-
// Collect sitemap.xml URLs that should be crawled
const sitemapsToProcess = [];
for (const source of sequence) {
@@ -202,15 +195,8 @@ async function qualifyFiles({ config }) {
const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml");
if (isHttpUrl && isSitemapUrl) {
- // Determine if this specific URL should be crawled
- let crawlThisUrl = shouldCrawl;
-
- // If crawl config is not explicitly set, use protocol-based default
- if (config.crawl === undefined) {
- crawlThisUrl = true; // HTTPS/HTTP sitemap.xml URLs crawled by default
- }
-
- if (crawlThisUrl) {
+ // Check if crawling is enabled (defaults to false in config)
+ if (config.crawl === true) {
sitemapsToProcess.push(source);
}
}
From b11bd5085ab0667ea91083a69e510041306738f3 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 4 Nov 2025 13:47:59 +0000
Subject: [PATCH 8/9] Fix origin matching to use final URL after redirects
- Update crawlSitemap to track final URL after axios redirects
- Use responseUrl from axios response for origin comparison
- Fixes issue where www.doc-detective.com redirects to doc-detective.com
- Now correctly discovers 125 URLs from doc-detective.com sitemap
- All 66 tests passing
Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com>
---
src/crawler.js | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/src/crawler.js b/src/crawler.js
index 4549b77..b97fd95 100644
--- a/src/crawler.js
+++ b/src/crawler.js
@@ -73,12 +73,19 @@ async function crawlSitemap({ config, sitemapUrl, log }) {
// Fetch the sitemap content
let content;
+ let finalUrl = sitemapUrl;
try {
const response = await axios.get(sitemapUrl, {
timeout: 30000,
maxRedirects: 5,
});
content = response.data;
+
+ // Use the final URL after redirects for origin comparison
+ if (response.request && response.request.res && response.request.res.responseUrl) {
+ finalUrl = response.request.res.responseUrl;
+ logger(config, "debug", `Sitemap redirected to: ${finalUrl}`);
+ }
} catch (error) {
logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
return discoveredUrls;
@@ -88,9 +95,9 @@ async function crawlSitemap({ config, sitemapUrl, log }) {
if (typeof content === "string") {
const extractedUrls = extractXmlSitemapUrls(content);
- // Filter URLs to only include same-origin URLs
+ // Filter URLs to only include same-origin URLs (using final URL after redirects)
for (const url of extractedUrls) {
- if (isSameOrigin(url, sitemapUrl)) {
+ if (isSameOrigin(url, finalUrl)) {
discoveredUrls.push(url);
} else {
logger(config, "debug", `Skipping cross-origin URL: ${url}`);
From 50dac4eb9e8eda86c40af2817affe271e94ae23f Mon Sep 17 00:00:00 2001
From: hawkeyexl
Date: Tue, 4 Nov 2025 13:34:31 -0500
Subject: [PATCH 9/9] Save as HTML
---
dev/index.js | 3 ++-
src/utils.js | 6 +++++-
2 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/dev/index.js b/dev/index.js
index 847624b..03afb33 100644
--- a/dev/index.js
+++ b/dev/index.js
@@ -12,7 +12,8 @@ main();
*/
async function main() {
const json = {
- input: "dev/doc-content.dita",
+ input: "https://www.doc-detective.com/sitemap.xml",
+ crawl: true,
logLevel: "debug",
runOn: [
{
diff --git a/src/utils.js b/src/utils.js
index 9253cdf..39bcd78 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -154,7 +154,11 @@ async function fetchFile(fileURL) {
} else {
response.data = response.data.toString();
}
- const fileName = fileURL.split("/").pop();
+ let fileName = fileURL.split("/").pop();
+ // If fileName doesn't have an extension, add ".html"
+ if (!path.extname(fileName)) {
+ fileName += ".html";
+ }
const hash = crypto.createHash("md5").update(response.data).digest("hex");
const filePath = `${os.tmpdir}/doc-detective/${hash}_${fileName}`;
// If doc-detective temp directory doesn't exist, create it