diff --git a/dev/index.js b/dev/index.js
index 847624b..03afb33 100644
--- a/dev/index.js
+++ b/dev/index.js
@@ -12,7 +12,8 @@ main();
  */
 async function main() {
   const json = {
-    input: "dev/doc-content.dita",
+    input: "https://www.doc-detective.com/sitemap.xml",
+    crawl: true,
     logLevel: "debug",
     runOn: [
       {
diff --git a/src/config.test.js b/src/config.test.js
index 3880c5b..dc8e283 100644
--- a/src/config.test.js
+++ b/src/config.test.js
@@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () {
     expect(result.concurrentRunners).to.equal(4);
   });
 });
+
+// Exercises how setConfig() handles the boolean `crawl` option: an explicit
+// value must survive validation, and an omitted value must resolve to false.
+describe("crawl config field", function () {
+  // Builds a fresh config for each test. `crawlSettings` is spread in right
+  // after `input`, the position a hand-written `crawl` key would occupy.
+  const buildConfig = (crawlSettings) => ({
+    input: ["https://example.com"],
+    ...crawlSettings,
+    logLevel: "info",
+    fileTypes: ["markdown"],
+  });
+
+  // Passes the built config through setConfig() and returns the resulting `crawl`.
+  const resolveCrawl = async (crawlSettings) => {
+    const resolved = await setConfig({ config: buildConfig(crawlSettings) });
+    return resolved.crawl;
+  };
+
+  it("should preserve crawl field through validation", async function () {
+    // An explicit true must come back unchanged.
+    const crawl = await resolveCrawl({ crawl: true });
+
+    expect(crawl).to.equal(true);
+  });
+
+  it("should handle crawl field set to false", async function () {
+    // An explicit false must come back as false.
+    const crawl = await resolveCrawl({ crawl: false });
+
+    expect(crawl).to.equal(false);
+  });
+
+  it("should default crawl field to false when not specified", async function () {
+    // Nothing is spread in, so the config carries no `crawl` key at all.
+    const crawl = await resolveCrawl({});
+
+    expect(crawl).to.equal(false);
+  });
+});
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
new file mode 100644
index 0000000..ad885c3
--- /dev/null
+++ b/src/crawler.integration.test.js
@@ -0,0 +1,190 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
+// Root-level hook: load chai through dynamic import() and expose `expect` globally.
+before(async function () {
+  global.expect = (await import("chai")).expect;
+});
+
+describe("crawler integration", function () {
+  let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub;
+
+  beforeEach(function () {
+    axiosStub = {
+      get: sinon.stub(),
+    };
+    
+    fsStub = {
+      statSync: sinon.stub(),
+      readdirSync: sinon.stub(),
+      existsSync: sinon.stub(),
+      mkdirSync: sinon.stub(),
+      writeFileSync: sinon.stub(),
+    };
+    
+    crawlSitemapStub = sinon.stub();
+    readFileStub = sinon.stub().resolves({});
+    
+    // Mock fetchFile behavior
+    axiosStub.get.callsFake(async (url) => {
+      if (url.endsWith("sitemap.xml")) {
+        return {
+          data: `<?xml version="1.0" encoding="UTF-8"?>
+            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+              <url><loc>https://example.com/page1</loc></url>
+              <url><loc>https://example.com/page2</loc></url>
+            </urlset>`,
+        };
+      }
+      return { data: "" };
+    });
+    
+    const utilsModule = proxyquire("./utils", {
+      axios: axiosStub,
+      fs: fsStub,
+      "./crawler": { crawlSitemap: crawlSitemapStub },
+      "doc-detective-common": {
+        validate: () => ({ valid: true }),
+        resolvePaths: (x) => x,
+        transformToSchemaKey: (x) => x,
+        readFile: readFileStub,
+      },
+    });
+    
+    qualifyFiles = utilsModule.qualifyFiles;
+  });
+
+  afterEach(function () {
+    sinon.restore();
+  });
+
+  it("should process sitemap.xml URLs when crawl is true", async function () {
+    const config = {
+      input: ["https://example.com/sitemap.xml"],
+      crawl: true,
+      logLevel: "info",
+      fileTypes: [],
+    };
+    
+    crawlSitemapStub.resolves([
+      "https://example.com/page1",
+      "https://example.com/page2",
+    ]);
+    
+    // Mock file system calls for fetched files
+    fsStub.existsSync.returns(true);
+    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+    
+    await qualifyFiles({ config });
+    
+    expect(crawlSitemapStub.calledOnce).to.be.true;
+    expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml");
+  });
+
+  it("should not process non-sitemap URLs", async function () {
+    const config = {
+      input: ["https://example.com/page.html"],
+      logLevel: "info",
+      fileTypes: [],
+    };
+    
+    // Mock file system calls for fetched files
+    fsStub.existsSync.returns(true);
+    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+    
+    await qualifyFiles({ config });
+    
+    expect(crawlSitemapStub.called).to.be.false;
+  });
+
+  it("should disable processing when crawl is false", async function () {
+    const config = {
+      input: ["https://example.com/sitemap.xml"],
+      crawl: false,
+      logLevel: "info",
+      fileTypes: [],
+    };
+    
+    // Mock file system calls for fetched files
+    fsStub.existsSync.returns(true);
+    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+    
+    await qualifyFiles({ config });
+    
+    expect(crawlSitemapStub.called).to.be.false;
+  });
+
+  it("should enable processing when crawl is true", async function () {
+    const config = {
+      input: ["https://example.com/sitemap.xml"],
+      crawl: true,
+      logLevel: "info",
+      fileTypes: [],
+    };
+    
+    crawlSitemapStub.resolves([
+      "https://example.com/page1",
+      "https://example.com/page2",
+    ]);
+    
+    // Mock file system calls for fetched files
+    fsStub.existsSync.returns(true);
+    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+    
+    await qualifyFiles({ config });
+    
+    expect(crawlSitemapStub.calledOnce).to.be.true;
+  });
+
+  it("should not process file:// URLs", async function () {
+    // crawl is on and the name ends in sitemap.xml, so the file:// scheme is the
+    // only thing that can keep this source from being handed to the crawler.
+    const config = { input: ["file:///docs/sitemap.xml"], crawl: true, logLevel: "info", fileTypes: [] };
+    fsStub.existsSync.returns(true);
+    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+
+    await qualifyFiles({ config });
+
+    expect(crawlSitemapStub.called).to.be.false;
+  });
+
+  it("should log sitemap processing activity", async function () {
+    const config = {
+      input: ["https://example.com/sitemap.xml"],
+      crawl: true,
+      logLevel: "info",
+      fileTypes: [],
+    };
+    
+    crawlSitemapStub.resolves([
+      "https://example.com/page1",
+      "https://example.com/page2",
+    ]);
+    
+    // Mock file system calls for fetched files
+    fsStub.existsSync.returns(true);
+    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });
+    
+    // Capture console output
+    const originalConsoleLog = console.log;
+    const logOutput = [];
+    console.log = (...args) => {
+      logOutput.push(args.join(" "));
+      originalConsoleLog(...args);
+    };
+    
+    try {
+      await qualifyFiles({ config });
+      
+      // Check that processing info was logged
+      const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap"));
+      const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));
+      
+      expect(hasProcessingLog).to.be.true;
+      expect(hasDiscoveredLog).to.be.true;
+    } finally {
+      console.log = originalConsoleLog;
+    }
+  });
+});
diff --git a/src/crawler.js b/src/crawler.js
new file mode 100644
index 0000000..b97fd95
--- /dev/null
+++ b/src/crawler.js
@@ -0,0 +1,111 @@
+const axios = require("axios");
+
+exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
+exports.isSameOrigin = isSameOrigin;
+exports.crawlSitemap = crawlSitemap;
+
+/**
+ * Extracts the URLs listed in the `<loc>` elements of an XML sitemap.
+ * Escaped text is decoded (`&amp;` becomes `&`; named entities and in-range numeric
+ * references), while CDATA-wrapped values are taken verbatim.
+ * @param {string} xml - The XML sitemap content to parse
+ * @returns {string[]} - Trimmed, non-empty URLs in document order ([] for non-strings)
+ */
+function extractXmlSitemapUrls(xml) {
+  if (typeof xml !== "string") return [];
+  const named = { amp: "&", lt: "<", gt: ">", quot: '"', apos: "'" };
+  const decode = (text) =>
+    text.replace(/&(amp|lt|gt|quot|apos|#\d+|#x[0-9a-fA-F]+);/g, (whole, ref) => {
+      if (ref[0] !== "#") return named[ref];
+      const code = ref[1] === "x" ? parseInt(ref.slice(2), 16) : parseInt(ref.slice(1), 10);
+      return code <= 0x10ffff ? String.fromCodePoint(code) : whole;
+    });
+  // Group 1 captures a CDATA payload, group 2 ordinary (escaped) text content.
+  const locRegex = /<loc>(?:\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*|([^<]+))<\/loc>/gi;
+  const urls = [];
+  for (let m = locRegex.exec(xml); m !== null; m = locRegex.exec(xml)) {
+    const url = (m[1] !== undefined ? m[1] : decode(m[2])).trim();
+    if (url) urls.push(url);
+  }
+  return urls;
+}
+
+/**
+ * Reports whether two URLs share an origin.
+ *
+ * The protocol, hostname, and port of both URLs must all be identical.
+ *
+ * @param {string} url1 - First URL to compare
+ * @param {string} url2 - Second URL to compare
+ * @returns {boolean} - True when all three components match; false when they
+ *   differ or when either argument cannot be parsed as a URL
+ */
+function isSameOrigin(url1, url2) {
+  let left;
+  let right;
+  try {
+    left = new URL(url1);
+    right = new URL(url2);
+  } catch (parseError) {
+    // An unparsable URL has no origin to compare.
+    return false;
+  }
+  const originParts = ["protocol", "hostname", "port"];
+  return originParts.every((part) => left[part] === right[part]);
+}
+
+/**
+ * Fetches an XML sitemap and returns the same-origin page URLs it lists.
+ *
+ * A failed fetch is logged at "warn" level and yields an empty array instead of
+ * throwing. URLs are compared against the sitemap's final (post-redirect) URL and
+ * each one is returned at most once, in first-seen order.
+ *
+ * @param {Object} options - Crawling options
+ * @param {Object} options.config - Configuration object, forwarded to `log`
+ * @param {string} options.sitemapUrl - URL of the sitemap to process
+ * @param {Function} [options.log] - Logger invoked as `log(config, level, message)`
+ * @returns {Promise<string[]>} - Promise resolving to the discovered URLs
+ */
+async function crawlSitemap({ config, sitemapUrl, log }) {
+  // Default no-op logger if not provided
+  const logger = log || (() => {});
+  logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);
+
+  let content;
+  let finalUrl = sitemapUrl;
+  try {
+    const response = await axios.get(sitemapUrl, {
+      timeout: 30000,
+      maxRedirects: 5,
+    });
+    content = response.data;
+    // The response URL can be populated even when nothing redirected, so only
+    // switch origins (and log) when it actually differs from the request URL.
+    const responseUrl =
+      response.request && response.request.res && response.request.res.responseUrl;
+    if (responseUrl && responseUrl !== sitemapUrl) {
+      finalUrl = responseUrl;
+      logger(config, "debug", `Sitemap redirected to: ${finalUrl}`);
+    }
+  } catch (error) {
+    logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
+    return [];
+  }
+
+  // A Set keeps first-seen order while dropping URLs the sitemap lists twice.
+  const discoveredUrls = new Set();
+  if (typeof content === "string") {
+    for (const url of extractXmlSitemapUrls(content)) {
+      if (isSameOrigin(url, finalUrl)) {
+        discoveredUrls.add(url);
+      } else {
+        logger(config, "debug", `Skipping cross-origin URL: ${url}`);
+      }
+    }
+  } else {
+    logger(config, "warn", `Sitemap ${sitemapUrl} did not return text content`);
+  }
+  logger(config, "info", `Discovered ${discoveredUrls.size} URL(s) from sitemap`);
+  return [...discoveredUrls];
+}
diff --git a/src/crawler.test.js b/src/crawler.test.js
new file mode 100644
index 0000000..54a07e8
--- /dev/null
+++ b/src/crawler.test.js
@@ -0,0 +1,240 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
+// Root-level hook: pull in chai with dynamic import() and publish `expect` on global.
+before(async function () {
+  global.expect = (await import("chai")).expect;
+});
+
+describe("crawler", function () {
+  describe("extractXmlSitemapUrls", function () {
+    let extractXmlSitemapUrls;
+
+    beforeEach(function () {
+      const crawler = require("./crawler");
+      extractXmlSitemapUrls = crawler.extractXmlSitemapUrls;
+    });
+
+    it("should extract single URL from XML sitemap", function () {
+      const xml = `<?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url>
+            <loc>https://example.com/page1</loc>
+          </url>
+        </urlset>`;
+      const urls = extractXmlSitemapUrls(xml);
+      expect(urls).to.deep.equal(["https://example.com/page1"]);
+    });
+
+    it("should extract multiple URLs from XML sitemap", function () {
+      const xml = `<?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url>
+            <loc>https://example.com/page1</loc>
+          </url>
+          <url>
+            <loc>https://example.com/page2</loc>
+          </url>
+          <url>
+            <loc>https://example.com/page3</loc>
+          </url>
+        </urlset>`;
+      const urls = extractXmlSitemapUrls(xml);
+      expect(urls).to.deep.equal([
+        "https://example.com/page1",
+        "https://example.com/page2",
+        "https://example.com/page3",
+      ]);
+    });
+
+    it("should handle empty string", function () {
+      const urls = extractXmlSitemapUrls("");
+      expect(urls).to.deep.equal([]);
+    });
+
+    it("should handle non-string input", function () {
+      const urls = extractXmlSitemapUrls(null);
+      expect(urls).to.deep.equal([]);
+    });
+
+    it("should handle XML without loc tags", function () {
+      const xml = "<?xml version=\"1.0\"?><root><item>test</item></root>";
+      const urls = extractXmlSitemapUrls(xml);
+      expect(urls).to.deep.equal([]);
+    });
+  });
+
+  describe("isSameOrigin", function () {
+    let isSameOrigin;
+
+    beforeEach(function () {
+      const crawler = require("./crawler");
+      isSameOrigin = crawler.isSameOrigin;
+    });
+
+    it("should return true for same protocol, domain, and port", function () {
+      const result = isSameOrigin(
+        "https://example.com:443/page1",
+        "https://example.com:443/page2"
+      );
+      expect(result).to.be.true;
+    });
+
+    it("should return true for same origin with default ports", function () {
+      const result = isSameOrigin(
+        "https://example.com/page1",
+        "https://example.com/page2"
+      );
+      expect(result).to.be.true;
+    });
+
+    it("should return false for different protocol", function () {
+      const result = isSameOrigin(
+        "http://example.com/page1",
+        "https://example.com/page2"
+      );
+      expect(result).to.be.false;
+    });
+
+    it("should return false for different domain", function () {
+      const result = isSameOrigin(
+        "https://example.com/page1",
+        "https://other.com/page2"
+      );
+      expect(result).to.be.false;
+    });
+
+    it("should return false for different port", function () {
+      const result = isSameOrigin(
+        "https://example.com:443/page1",
+        "https://example.com:8080/page2"
+      );
+      expect(result).to.be.false;
+    });
+
+    it("should return false for subdomain differences", function () {
+      const result = isSameOrigin(
+        "https://example.com/page1",
+        "https://subdomain.example.com/page2"
+      );
+      expect(result).to.be.false;
+    });
+
+    it("should return false for malformed URLs", function () {
+      const result = isSameOrigin("not a url", "https://example.com");
+      expect(result).to.be.false;
+    });
+
+    it("should handle query parameters", function () {
+      const result = isSameOrigin(
+        "https://example.com/page?foo=bar",
+        "https://example.com/page?baz=qux"
+      );
+      expect(result).to.be.true;
+    });
+
+    it("should handle fragments", function () {
+      const result = isSameOrigin(
+        "https://example.com/page#section1",
+        "https://example.com/page#section2"
+      );
+      expect(result).to.be.true;
+    });
+  });
+
+  describe("crawlSitemap", function () {
+    let crawlSitemap, axiosStub, logStub;
+
+    beforeEach(function () {
+      axiosStub = {
+        get: sinon.stub(),
+      };
+      logStub = sinon.stub();
+
+      const crawlerModule = proxyquire("./crawler", {
+        axios: axiosStub,
+      });
+      crawlSitemap = crawlerModule.crawlSitemap;
+    });
+
+    afterEach(function () {
+      sinon.restore();
+    });
+
+    it("should process sitemap and extract same-origin URLs", async function () {
+      const config = { logLevel: "info" };
+      const sitemapUrl = "https://example.com/sitemap.xml";
+      const sitemapContent = `<?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/page1</loc></url>
+          <url><loc>https://example.com/page2</loc></url>
+        </urlset>`;
+      
+      axiosStub.get.resolves({ data: sitemapContent });
+
+      const urls = await crawlSitemap({
+        config,
+        sitemapUrl,
+        log: logStub,
+      });
+
+      expect(urls).to.deep.equal([
+        "https://example.com/page1",
+        "https://example.com/page2",
+      ]);
+      expect(axiosStub.get.calledOnce).to.be.true;
+    });
+
+    it("should filter out cross-origin URLs", async function () {
+      const config = { logLevel: "info" };
+      const sitemapUrl = "https://example.com/sitemap.xml";
+      const sitemapContent = `<?xml version="1.0" encoding="UTF-8"?>
+        <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+          <url><loc>https://example.com/page1</loc></url>
+          <url><loc>https://other.com/page2</loc></url>
+        </urlset>`;
+      
+      axiosStub.get.resolves({ data: sitemapContent });
+
+      const urls = await crawlSitemap({
+        config,
+        sitemapUrl,
+        log: logStub,
+      });
+
+      expect(urls).to.deep.equal(["https://example.com/page1"]);
+    });
+
+    it("should handle fetch errors gracefully", async function () {
+      const config = { logLevel: "info" };
+      const sitemapUrl = "https://example.com/sitemap.xml";
+      
+      axiosStub.get.rejects(new Error("404 Not Found"));
+
+      const urls = await crawlSitemap({
+        config,
+        sitemapUrl,
+        log: logStub,
+      });
+
+      expect(urls).to.deep.equal([]);
+      expect(logStub.calledWith(config, "warn")).to.be.true;
+    });
+
+    it("should handle non-string content", async function () {
+      const config = { logLevel: "info" };
+      const sitemapUrl = "https://example.com/sitemap.xml";
+      
+      axiosStub.get.resolves({ data: { json: "object" } });
+
+      const urls = await crawlSitemap({
+        config,
+        sitemapUrl,
+        log: logStub,
+      });
+
+      expect(urls).to.deep.equal([]);
+    });
+  });
+});
diff --git a/src/utils.js b/src/utils.js
index 8b0a61c..39bcd78 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -11,6 +11,7 @@ const {
   transformToSchemaKey,
   readFile,
 } = require("doc-detective-common");
+const { crawlSitemap } = require("./crawler");
 
 exports.qualifyFiles = qualifyFiles;
 exports.parseTests = parseTests;
@@ -153,7 +154,11 @@ async function fetchFile(fileURL) {
     } else {
       response.data = response.data.toString();
     }
-    const fileName = fileURL.split("/").pop();
+    let fileName = fileURL.split("/").pop();
+    // If fileName doesn't have an extension, add ".html"
+    if (!path.extname(fileName)) {
+      fileName += ".html";
+    }
     const hash = crypto.createHash("md5").update(response.data).digest("hex");
     const filePath = `${os.tmpdir}/doc-detective/${hash}_${fileName}`;
     // If doc-detective temp directory doesn't exist, create it
@@ -184,6 +189,52 @@ async function qualifyFiles({ config }) {
   const cleanup = config.afterAll;
   if (cleanup) sequence = sequence.concat(cleanup);
 
+  // Collect sitemap.xml URLs that should be crawled
+  const sitemapsToProcess = [];
+  for (const source of sequence) {
+    const isHttpUrl =
+      typeof source === "string" &&
+      (source.startsWith("http://") || source.startsWith("https://"));
+    
+    const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml");
+
+    if (isHttpUrl && isSitemapUrl) {
+      // Check if crawling is enabled (defaults to false in config)
+      if (config.crawl === true) {
+        sitemapsToProcess.push(source);
+      }
+    }
+  }
+
+  // Process sitemaps if there are any to crawl
+  if (sitemapsToProcess.length > 0) {
+    log(config, "info", `Processing ${sitemapsToProcess.length} sitemap(s)...`);
+    try {
+      const allDiscoveredUrls = [];
+      
+      // Process each sitemap
+      for (const sitemapUrl of sitemapsToProcess) {
+        const discoveredUrls = await crawlSitemap({
+          config,
+          sitemapUrl,
+          log,
+        });
+        allDiscoveredUrls.push(...discoveredUrls);
+      }
+      
+      // Add newly discovered URLs to the sequence
+      // Filter out URLs that were already in the initial sequence
+      const newUrls = allDiscoveredUrls.filter((url) => !sequence.includes(url));
+      log(config, "info", `Discovered ${newUrls.length} additional URL(s) from sitemap(s)`);
+      
+      // Add new URLs after the input section but before cleanup
+      const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length;
+      sequence.splice(cleanupStartIndex, 0, ...newUrls);
+    } catch (error) {
+      log(config, "error", `Sitemap processing failed: ${error.message}`);
+    }
+  }
+
   for (let source of sequence) {
     log(config, "debug", `source: ${source}`);
     // Check if source is a URL