diff --git a/dev/index.js b/dev/index.js
index 847624b..03afb33 100644
--- a/dev/index.js
+++ b/dev/index.js
@@ -12,7 +12,8 @@ main();
*/
async function main() {
const json = {
- input: "dev/doc-content.dita",
+ input: "https://www.doc-detective.com/sitemap.xml",
+ crawl: true,
logLevel: "debug",
runOn: [
{
diff --git a/src/config.test.js b/src/config.test.js
index 3880c5b..dc8e283 100644
--- a/src/config.test.js
+++ b/src/config.test.js
@@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () {
expect(result.concurrentRunners).to.equal(4);
});
});
+
describe("crawl config field", function () {
  // Minimal valid config shared by every test; crawl-specific overrides are
  // layered on top per test. A factory (not a constant) so no test can
  // mutate another's input.
  const baseConfig = () => ({
    input: ["https://example.com"],
    logLevel: "info",
    fileTypes: ["markdown"],
  });

  it("should preserve crawl field through validation", async function () {
    const result = await setConfig({
      config: { ...baseConfig(), crawl: true },
    });
    expect(result.crawl).to.equal(true);
  });

  it("should handle crawl field set to false", async function () {
    const result = await setConfig({
      config: { ...baseConfig(), crawl: false },
    });
    expect(result.crawl).to.equal(false);
  });

  it("should default crawl field to false when not specified", async function () {
    // No `crawl` key at all: validation should fill in the default.
    const result = await setConfig({ config: baseConfig() });
    expect(result.crawl).to.equal(false);
  });
});
diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js
new file mode 100644
index 0000000..ad885c3
--- /dev/null
+++ b/src/crawler.integration.test.js
@@ -0,0 +1,190 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
// Root hook: chai ships as ESM in recent versions, so load it once via
// dynamic import and expose `expect` globally for every test in this file.
before(async function () {
  const chai = await import("chai");
  global.expect = chai.expect;
});
+
describe("crawler integration", function () {
  let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub;

  beforeEach(function () {
    axiosStub = {
      get: sinon.stub(),
    };

    fsStub = {
      statSync: sinon.stub(),
      readdirSync: sinon.stub(),
      existsSync: sinon.stub(),
      mkdirSync: sinon.stub(),
      writeFileSync: sinon.stub(),
    };

    crawlSitemapStub = sinon.stub();
    readFileStub = sinon.stub().resolves({});

    // Mock fetchFile behavior: serve a well-formed sitemap document for
    // sitemap.xml requests and an empty body for any other URL.
    axiosStub.get.callsFake(async (url) => {
      if (url.endsWith("sitemap.xml")) {
        return {
          data: `<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/page1</loc></url>
  <url><loc>https://example.com/page2</loc></url>
</urlset>`,
        };
      }
      return { data: "" };
    });

    // Load utils with every external touchpoint stubbed out so the tests
    // never hit the network, the real file system, or the real crawler.
    const utilsModule = proxyquire("./utils", {
      axios: axiosStub,
      fs: fsStub,
      "./crawler": { crawlSitemap: crawlSitemapStub },
      "doc-detective-common": {
        validate: () => ({ valid: true }),
        resolvePaths: (x) => x,
        transformToSchemaKey: (x) => x,
        readFile: readFileStub,
      },
    });

    qualifyFiles = utilsModule.qualifyFiles;
  });

  afterEach(function () {
    sinon.restore();
  });

  it("should process sitemap.xml URLs when crawl is true", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.calledOnce).to.be.true;
    expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml");
  });

  it("should not process non-sitemap URLs", async function () {
    // No `crawl` flag and not a sitemap URL: the crawler must stay idle.
    const config = {
      input: ["https://example.com/page.html"],
      logLevel: "info",
      fileTypes: [],
    };

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should disable processing when crawl is false", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: false,
      logLevel: "info",
      fileTypes: [],
    };

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should enable processing when crawl is true", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    await qualifyFiles({ config });

    expect(crawlSitemapStub.calledOnce).to.be.true;
  });

  it("should not process file:// URLs", async function () {
    // NOTE(review): this exercises an empty `input`, not an actual file://
    // URL — kept as-is to preserve behavior; confirm intent with the author.
    const config = {
      input: [],
      logLevel: "info",
      fileTypes: [],
    };

    await qualifyFiles({ config });

    expect(crawlSitemapStub.called).to.be.false;
  });

  it("should log sitemap processing activity", async function () {
    const config = {
      input: ["https://example.com/sitemap.xml"],
      crawl: true,
      logLevel: "info",
      fileTypes: [],
    };

    crawlSitemapStub.resolves([
      "https://example.com/page1",
      "https://example.com/page2",
    ]);

    // Mock file system calls for fetched files
    fsStub.existsSync.returns(true);
    fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false });

    // Capture console output so log lines can be asserted on, while still
    // forwarding to the real console for debuggability.
    const originalConsoleLog = console.log;
    const logOutput = [];
    console.log = (...args) => {
      logOutput.push(args.join(" "));
      originalConsoleLog(...args);
    };

    try {
      await qualifyFiles({ config });

      // Check that processing info was logged
      const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap"));
      const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered"));

      expect(hasProcessingLog).to.be.true;
      expect(hasDiscoveredLog).to.be.true;
    } finally {
      // Always restore console.log, even when the assertions throw.
      console.log = originalConsoleLog;
    }
  });
});
diff --git a/src/crawler.js b/src/crawler.js
new file mode 100644
index 0000000..b97fd95
--- /dev/null
+++ b/src/crawler.js
@@ -0,0 +1,111 @@
+const axios = require("axios");
+
+exports.extractXmlSitemapUrls = extractXmlSitemapUrls;
+exports.isSameOrigin = isSameOrigin;
+exports.crawlSitemap = crawlSitemap;
+
/**
 * Extracts URLs from an XML sitemap.
 *
 * Parsing is intentionally regex-based rather than a full XML parse:
 * sitemaps are flat documents and only the text content of each <loc>
 * element is needed.
 *
 * @param {string} xml - The XML sitemap content to parse
 * @returns {string[]} - Array of extracted URLs (empty for non-string input)
 */
function extractXmlSitemapUrls(xml) {
  // Guard: anything that isn't a string (null, a Buffer, a parsed JSON
  // object handed back by the HTTP client) yields no URLs instead of throwing.
  if (typeof xml !== "string") {
    return [];
  }

  const urls = [];
  // Match <loc>...</loc> tags in XML sitemaps. Anchoring on the opening
  // tag prevents stray "text</loc>" fragments from matching. A fresh
  // literal is created per call, so the /g flag's lastIndex state never
  // leaks between invocations.
  const locRegex = /<loc>\s*([^<]+?)\s*<\/loc>/gi;
  let match;

  while ((match = locRegex.exec(xml)) !== null) {
    const url = match[1].trim();
    if (url) {
      urls.push(url);
    }
  }

  return urls;
}
+
/**
 * Compares two URLs for strict origin matching.
 *
 * "Origin" here is the protocol + hostname + port triple; paths, query
 * strings, and fragments are ignored.
 *
 * @param {string} url1 - First URL to compare
 * @param {string} url2 - Second URL to compare
 * @returns {boolean} - True if origins match strictly (protocol, hostname, and port)
 */
function isSameOrigin(url1, url2) {
  let first;
  let second;
  try {
    first = new URL(url1);
    second = new URL(url2);
  } catch (error) {
    // Unparseable input can never share an origin with anything.
    return false;
  }

  // Guard-clause comparison of each origin component in turn.
  if (first.protocol !== second.protocol) return false;
  if (first.hostname !== second.hostname) return false;
  return first.port === second.port;
}
+
/**
 * Processes an XML sitemap and extracts all URLs.
 *
 * Fetch failures are non-fatal: a warning is logged and an empty array is
 * returned, so one bad sitemap cannot abort a whole run. Cross-origin URLs
 * are filtered out, using the sitemap's post-redirect URL as the reference
 * origin.
 *
 * @param {Object} options - Crawling options
 * @param {Object} options.config - Configuration object (passed through to the logger)
 * @param {string} options.sitemapUrl - URL of the sitemap to process
 * @param {Function} [options.log] - Logging function (optional); invoked as log(config, level, message)
 * @returns {Promise<string[]>} - Promise resolving to array of all discovered same-origin URLs
 */
async function crawlSitemap({ config, sitemapUrl, log }) {
  // Default no-op logger if not provided
  const logger = log || (() => {});

  const discoveredUrls = [];

  logger(config, "debug", `Processing sitemap: ${sitemapUrl}`);

  // Fetch the sitemap content
  let content;
  let finalUrl = sitemapUrl;
  try {
    const response = await axios.get(sitemapUrl, {
      timeout: 30000,
      maxRedirects: 5,
    });
    content = response.data;

    // Use the final URL after redirects for origin comparison, so a sitemap
    // that redirects (e.g. http -> https) still yields same-origin matches.
    // responseUrl is axios/Node-specific plumbing; guard each level.
    if (response.request && response.request.res && response.request.res.responseUrl) {
      finalUrl = response.request.res.responseUrl;
      logger(config, "debug", `Sitemap redirected to: ${finalUrl}`);
    }
  } catch (error) {
    // Best-effort: warn and report zero URLs rather than throwing.
    logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`);
    return discoveredUrls;
  }

  // Extract URLs from sitemap. axios may hand back a parsed object (e.g.
  // for JSON responses); only string payloads are treated as XML.
  if (typeof content === "string") {
    const extractedUrls = extractXmlSitemapUrls(content);

    // Filter URLs to only include same-origin URLs (using final URL after redirects)
    for (const url of extractedUrls) {
      if (isSameOrigin(url, finalUrl)) {
        discoveredUrls.push(url);
      } else {
        logger(config, "debug", `Skipping cross-origin URL: ${url}`);
      }
    }
  }

  logger(config, "info", `Discovered ${discoveredUrls.length} URL(s) from sitemap`);

  return discoveredUrls;
}
diff --git a/src/crawler.test.js b/src/crawler.test.js
new file mode 100644
index 0000000..54a07e8
--- /dev/null
+++ b/src/crawler.test.js
@@ -0,0 +1,240 @@
+const assert = require("assert");
+const sinon = require("sinon");
+const proxyquire = require("proxyquire");
+
// Root hook: chai ships as ESM in recent versions, so load it once via
// dynamic import and expose `expect` globally for every test in this file.
before(async function () {
  const chai = await import("chai");
  global.expect = chai.expect;
});
+
+describe("crawler", function () {
+ describe("extractXmlSitemapUrls", function () {
+ let extractXmlSitemapUrls;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ extractXmlSitemapUrls = crawler.extractXmlSitemapUrls;
+ });
+
+ it("should extract single URL from XML sitemap", function () {
+ const xml = `
+
+
+ https://example.com/page1
+
+ `;
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should extract multiple URLs from XML sitemap", function () {
+ const xml = `
+
+
+ https://example.com/page1
+
+
+ https://example.com/page2
+
+
+ https://example.com/page3
+
+ `;
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ "https://example.com/page3",
+ ]);
+ });
+
+ it("should handle empty string", function () {
+ const urls = extractXmlSitemapUrls("");
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle non-string input", function () {
+ const urls = extractXmlSitemapUrls(null);
+ expect(urls).to.deep.equal([]);
+ });
+
+ it("should handle XML without loc tags", function () {
+ const xml = "- test
";
+ const urls = extractXmlSitemapUrls(xml);
+ expect(urls).to.deep.equal([]);
+ });
+ });
+
+ describe("isSameOrigin", function () {
+ let isSameOrigin;
+
+ beforeEach(function () {
+ const crawler = require("./crawler");
+ isSameOrigin = crawler.isSameOrigin;
+ });
+
+ it("should return true for same protocol, domain, and port", function () {
+ const result = isSameOrigin(
+ "https://example.com:443/page1",
+ "https://example.com:443/page2"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should return true for same origin with default ports", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://example.com/page2"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should return false for different protocol", function () {
+ const result = isSameOrigin(
+ "http://example.com/page1",
+ "https://example.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for different domain", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://other.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for different port", function () {
+ const result = isSameOrigin(
+ "https://example.com:443/page1",
+ "https://example.com:8080/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for subdomain differences", function () {
+ const result = isSameOrigin(
+ "https://example.com/page1",
+ "https://subdomain.example.com/page2"
+ );
+ expect(result).to.be.false;
+ });
+
+ it("should return false for malformed URLs", function () {
+ const result = isSameOrigin("not a url", "https://example.com");
+ expect(result).to.be.false;
+ });
+
+ it("should handle query parameters", function () {
+ const result = isSameOrigin(
+ "https://example.com/page?foo=bar",
+ "https://example.com/page?baz=qux"
+ );
+ expect(result).to.be.true;
+ });
+
+ it("should handle fragments", function () {
+ const result = isSameOrigin(
+ "https://example.com/page#section1",
+ "https://example.com/page#section2"
+ );
+ expect(result).to.be.true;
+ });
+ });
+
+ describe("crawlSitemap", function () {
+ let crawlSitemap, axiosStub, logStub;
+
+ beforeEach(function () {
+ axiosStub = {
+ get: sinon.stub(),
+ };
+ logStub = sinon.stub();
+
+ const crawlerModule = proxyquire("./crawler", {
+ axios: axiosStub,
+ });
+ crawlSitemap = crawlerModule.crawlSitemap;
+ });
+
+ afterEach(function () {
+ sinon.restore();
+ });
+
+ it("should process sitemap and extract same-origin URLs", async function () {
+ const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+ const sitemapContent = `
+
+ https://example.com/page1
+ https://example.com/page2
+ `;
+
+ axiosStub.get.resolves({ data: sitemapContent });
+
+ const urls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log: logStub,
+ });
+
+ expect(urls).to.deep.equal([
+ "https://example.com/page1",
+ "https://example.com/page2",
+ ]);
+ expect(axiosStub.get.calledOnce).to.be.true;
+ });
+
+ it("should filter out cross-origin URLs", async function () {
+ const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+ const sitemapContent = `
+
+ https://example.com/page1
+ https://other.com/page2
+ `;
+
+ axiosStub.get.resolves({ data: sitemapContent });
+
+ const urls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log: logStub,
+ });
+
+ expect(urls).to.deep.equal(["https://example.com/page1"]);
+ });
+
+ it("should handle fetch errors gracefully", async function () {
+ const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+
+ axiosStub.get.rejects(new Error("404 Not Found"));
+
+ const urls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log: logStub,
+ });
+
+ expect(urls).to.deep.equal([]);
+ expect(logStub.calledWith(config, "warn")).to.be.true;
+ });
+
+ it("should handle non-string content", async function () {
+ const config = { logLevel: "info" };
+ const sitemapUrl = "https://example.com/sitemap.xml";
+
+ axiosStub.get.resolves({ data: { json: "object" } });
+
+ const urls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log: logStub,
+ });
+
+ expect(urls).to.deep.equal([]);
+ });
+ });
+});
diff --git a/src/utils.js b/src/utils.js
index 8b0a61c..39bcd78 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -11,6 +11,7 @@ const {
transformToSchemaKey,
readFile,
} = require("doc-detective-common");
+const { crawlSitemap } = require("./crawler");
exports.qualifyFiles = qualifyFiles;
exports.parseTests = parseTests;
@@ -153,7 +154,11 @@ async function fetchFile(fileURL) {
} else {
response.data = response.data.toString();
}
- const fileName = fileURL.split("/").pop();
+ let fileName = fileURL.split("/").pop();
+ // If fileName doesn't have an extension, add ".html"
+ if (!path.extname(fileName)) {
+ fileName += ".html";
+ }
const hash = crypto.createHash("md5").update(response.data).digest("hex");
const filePath = `${os.tmpdir}/doc-detective/${hash}_${fileName}`;
// If doc-detective temp directory doesn't exist, create it
@@ -184,6 +189,52 @@ async function qualifyFiles({ config }) {
const cleanup = config.afterAll;
if (cleanup) sequence = sequence.concat(cleanup);
+ // Collect sitemap.xml URLs that should be crawled
+ const sitemapsToProcess = [];
+ for (const source of sequence) {
+ const isHttpUrl =
+ typeof source === "string" &&
+ (source.startsWith("http://") || source.startsWith("https://"));
+
+ const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml");
+
+ if (isHttpUrl && isSitemapUrl) {
+ // Check if crawling is enabled (defaults to false in config)
+ if (config.crawl === true) {
+ sitemapsToProcess.push(source);
+ }
+ }
+ }
+
+ // Process sitemaps if there are any to crawl
+ if (sitemapsToProcess.length > 0) {
+ log(config, "info", `Processing ${sitemapsToProcess.length} sitemap(s)...`);
+ try {
+ const allDiscoveredUrls = [];
+
+ // Process each sitemap
+ for (const sitemapUrl of sitemapsToProcess) {
+ const discoveredUrls = await crawlSitemap({
+ config,
+ sitemapUrl,
+ log,
+ });
+ allDiscoveredUrls.push(...discoveredUrls);
+ }
+
+ // Add newly discovered URLs to the sequence
+ // Filter out URLs that were already in the initial sequence
+ const newUrls = allDiscoveredUrls.filter((url) => !sequence.includes(url));
+ log(config, "info", `Discovered ${newUrls.length} additional URL(s) from sitemap(s)`);
+
+ // Add new URLs after the input section but before cleanup
+ const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length;
+ sequence.splice(cleanupStartIndex, 0, ...newUrls);
+ } catch (error) {
+ log(config, "error", `Sitemap processing failed: ${error.message}`);
+ }
+ }
+
for (let source of sequence) {
log(config, "debug", `source: ${source}`);
// Check if source is a URL