From 4c351465b3910948a5f2e27e69dfe8a59d3c25c4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:11:05 +0000 Subject: [PATCH 1/9] Initial plan From e0477aba2ecb9f398e4e5ba2fba2af86774edd12 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:20:22 +0000 Subject: [PATCH 2/9] Add URL crawling functionality with comprehensive tests Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- src/crawler.integration.test.js | 200 +++++++++++++ src/crawler.js | 209 ++++++++++++++ src/crawler.test.js | 491 ++++++++++++++++++++++++++++++++ src/utils.js | 52 ++++ 4 files changed, 952 insertions(+) create mode 100644 src/crawler.integration.test.js create mode 100644 src/crawler.js create mode 100644 src/crawler.test.js diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js new file mode 100644 index 0000000..d6f8797 --- /dev/null +++ b/src/crawler.integration.test.js @@ -0,0 +1,200 @@ +const assert = require("assert"); +const sinon = require("sinon"); +const proxyquire = require("proxyquire"); + +before(async function () { + const { expect } = await import("chai"); + global.expect = expect; +}); + +describe("crawler integration", function () { + let qualifyFiles, axiosStub, fsStub, logStub, crawlUrlsStub, readFileStub; + + beforeEach(function () { + axiosStub = { + get: sinon.stub(), + }; + + fsStub = { + statSync: sinon.stub(), + readdirSync: sinon.stub(), + existsSync: sinon.stub(), + mkdirSync: sinon.stub(), + writeFileSync: sinon.stub(), + }; + + logStub = sinon.stub(); + crawlUrlsStub = sinon.stub(); + readFileStub = sinon.stub().resolves({}); + + // Mock fetchFile behavior + axiosStub.get.callsFake(async (url) => { + if (url === "https://example.com/page1") { + return { + data: 'Link', + }; + } else if (url === "https://example.com/page2") { + return { data: "Content" }; + } + return { data: "" }; + }); + + const utilsModule = proxyquire("./utils", { + axios: axiosStub, + fs: fsStub, + "./crawler": { crawlUrls: crawlUrlsStub }, + "doc-detective-common": { + validate: () => ({ valid: true }), + resolvePaths: (x) => x, + transformToSchemaKey: (x) => x, + readFile: readFileStub, + }, + }); + + qualifyFiles = utilsModule.qualifyFiles; + }); + + afterEach(function () { + sinon.restore(); + }); + + it("should enable crawling by default for HTTP URLs", async function () { + const config = { + input: ["https://example.com/page1"], + logLevel: "info", + fileTypes: [], + }; + + crawlUrlsStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlUrlsStub.calledOnce).to.be.true; + expect(crawlUrlsStub.firstCall.args[0].initialUrls).to.deep.equal([ + "https://example.com/page1", + ]); + }); + + it("should disable crawling when crawl is false", async function () { + const config = { + input: ["https://example.com/page1"], + crawl: false, + logLevel: "info", + fileTypes: [], + }; + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlUrlsStub.called).to.be.false; + }); + + it("should enable crawling when crawl is true", async function () { + const config = { + input: ["https://example.com/page1"], + crawl: true, + logLevel: "info", + fileTypes: [], + }; + + crawlUrlsStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlUrlsStub.calledOnce).to.be.true; + }); + + it("should not crawl file:// URLs by default", async function () { + const config = { + input: [], // Empty input to avoid processing issues + logLevel: "info", + fileTypes: [], + }; + + // file:// URLs won't trigger crawling since they don't start with http:// or https:// + // This test just verifies no crawling happens + + await qualifyFiles({ config }); + + expect(crawlUrlsStub.called).to.be.false; + }); + + it("should pass origin config to crawler", async function () { + const config = { + input: ["https://example.com/page1"], + origin: "https://example.com", + crawl: true, + logLevel: "info", + fileTypes: [], + }; + + crawlUrlsStub.resolves(["https://example.com/page1"]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + await qualifyFiles({ config }); + + expect(crawlUrlsStub.calledOnce).to.be.true; + expect(crawlUrlsStub.firstCall.args[0].config.origin).to.equal( + "https://example.com" + ); + }); + + it("should log crawling activity", async function () { + const config = { + input: ["https://example.com/page1"], + crawl: true, + logLevel: "info", + fileTypes: [], + }; + + crawlUrlsStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); + + // Capture console output + const originalConsoleLog = console.log; + const logOutput = []; + console.log = (...args) => { + logOutput.push(args.join(" ")); + originalConsoleLog(...args); + }; + + try { + await qualifyFiles({ config }); + + // Check that crawling info was logged + const hasCrawlingLog = logOutput.some((msg) => msg.includes("Crawling")); + const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered")); + + expect(hasCrawlingLog).to.be.true; + expect(hasDiscoveredLog).to.be.true; + } finally { + console.log = originalConsoleLog; + } + }); +}); diff --git a/src/crawler.js b/src/crawler.js new file mode 100644 index 0000000..62c2ed5 --- /dev/null +++ b/src/crawler.js @@ -0,0 +1,209 @@ +const axios = require("axios"); +const { log } = require("./utils"); + +exports.extractHtmlUrls = extractHtmlUrls; +exports.extractMarkdownUrls = extractMarkdownUrls; +exports.isSameOrigin = isSameOrigin; +exports.resolveRelativeUrl = resolveRelativeUrl; +exports.crawlUrls = crawlUrls; + +/** + * Extracts URLs from HTML tags with href attributes. + * + * @param {string} html - The HTML content to parse + * @returns {string[]} - Array of extracted URLs + */ +function extractHtmlUrls(html) { + if (typeof html !== "string") { + return []; + } + + const urls = []; + // Match tags with href attributes + // This regex handles various formats: href="url", href='url', href=url + const anchorRegex = /]*?\s+)?href=["']?([^"'\s>]+)["']?[^>]*>/gi; + let match; + + while ((match = anchorRegex.exec(html)) !== null) { + const url = match[1]; + if (url && url !== "#" && !url.startsWith("javascript:")) { + urls.push(url); + } + } + + return urls; +} + +/** + * Extracts URLs from Markdown [text](url) syntax. + * + * @param {string} markdown - The Markdown content to parse + * @returns {string[]} - Array of extracted URLs + */ +function extractMarkdownUrls(markdown) { + if (typeof markdown !== "string") { + return []; + } + + const urls = []; + // Match [text](url) syntax, handling escaped brackets + // This regex avoids matching image syntax ![text](url) + const linkRegex = /(?} - Promise resolving to array of all discovered URLs + */ +async function crawlUrls({ config, initialUrls }) { + const visitedUrls = new Set(); + const discoveredUrls = []; + const MAX_URLS = 10000; + let urlQueue = [...initialUrls]; + + // Process each URL in the queue + while (urlQueue.length > 0 && discoveredUrls.length < MAX_URLS) { + const currentUrl = urlQueue.shift(); + + // Skip if already visited + if (visitedUrls.has(currentUrl)) { + continue; + } + + visitedUrls.add(currentUrl); + discoveredUrls.push(currentUrl); + + log(config, "debug", `Crawling: ${currentUrl}`); + + // Fetch the URL content + let content; + try { + const response = await axios.get(currentUrl, { + timeout: 30000, + maxRedirects: 5, + }); + content = response.data; + } catch (error) { + log(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`); + continue; + } + + // Extract URLs based on content type + let extractedUrls = []; + if (typeof content === "string") { + // Try both HTML and Markdown extraction + extractedUrls = [ + ...extractHtmlUrls(content), + ...extractMarkdownUrls(content), + ]; + } + + // Process extracted URLs + for (const url of extractedUrls) { + let absoluteUrl; + + // Check if URL is relative + try { + new URL(url); + absoluteUrl = url; + } catch { + // It's relative + if (config.origin) { + absoluteUrl = resolveRelativeUrl(url, config.origin); + if (!absoluteUrl) { + continue; // Skip malformed URLs + } + } else { + // No origin configured, skip relative URLs + log( + config, + "debug", + `Skipping relative URL (no origin configured): ${url}` + ); + continue; + } + } + + // Check if same origin as current URL + if (isSameOrigin(absoluteUrl, currentUrl)) { + if (!visitedUrls.has(absoluteUrl)) { + urlQueue.push(absoluteUrl); + } + } + } + } + + // Log warning if limit reached + if (discoveredUrls.length >= MAX_URLS) { + log( + config, + "warn", + `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs` + ); + } + + return discoveredUrls; +} diff --git a/src/crawler.test.js b/src/crawler.test.js new file mode 100644 index 0000000..0b17ca5 --- /dev/null +++ b/src/crawler.test.js @@ -0,0 +1,491 @@ +const assert = require("assert"); +const sinon = require("sinon"); +const proxyquire = require("proxyquire"); + +before(async function () { + const { expect } = await import("chai"); + global.expect = expect; +}); + +describe("crawler", function () { + describe("extractHtmlUrls", function () { + let extractHtmlUrls; + + beforeEach(function () { + const crawler = require("./crawler"); + extractHtmlUrls = crawler.extractHtmlUrls; + }); + + it("should extract single URL from HTML", function () { + const html = 'Link'; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should extract multiple URLs from HTML", function () { + const html = ` + Link 1 + Link 2 + Link 3 + `; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ]); + }); + + it("should handle single and double quotes", function () { + const html = ` + Link 1 + Link 2 + `; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + }); + + it("should ignore anchor links", function () { + const html = 'AnchorLink'; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal(["https://example.com"]); + }); + + it("should ignore javascript: links", function () { + const html = 'JS LinkLink'; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal(["https://example.com"]); + }); + + it("should handle empty string", function () { + const urls = extractHtmlUrls(""); + expect(urls).to.deep.equal([]); + }); + + it("should handle non-string input", function () { + const urls = extractHtmlUrls(null); + expect(urls).to.deep.equal([]); + }); + + it("should extract relative URLs", function () { + const html = 'RelativeAbsolute'; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal(["/page1", "https://example.com"]); + }); + }); + + describe("extractMarkdownUrls", function () { + let extractMarkdownUrls; + + beforeEach(function () { + const crawler = require("./crawler"); + extractMarkdownUrls = crawler.extractMarkdownUrls; + }); + + it("should extract single URL from Markdown", function () { + const markdown = "[Link](https://example.com/page1)"; + const urls = extractMarkdownUrls(markdown); + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should extract multiple URLs from Markdown", function () { + const markdown = ` + [Link 1](https://example.com/page1) + [Link 2](https://example.com/page2) + [Link 3](https://example.com/page3) + `; + const urls = extractMarkdownUrls(markdown); + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ]); + }); + + it("should ignore image syntax", function () { + const markdown = "![Image](https://example.com/image.png) [Link](https://example.com/page1)"; + const urls = extractMarkdownUrls(markdown); + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should handle URLs with title text", function () { + const markdown = '[Link](https://example.com/page1 "Title text")'; + const urls = extractMarkdownUrls(markdown); + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should handle empty string", function () { + const urls = extractMarkdownUrls(""); + expect(urls).to.deep.equal([]); + }); + + it("should handle non-string input", function () { + const urls = extractMarkdownUrls(null); + expect(urls).to.deep.equal([]); + }); + + it("should extract relative URLs", function () { + const markdown = "[Relative](/page1) [Absolute](https://example.com)"; + const urls = extractMarkdownUrls(markdown); + expect(urls).to.deep.equal(["/page1", "https://example.com"]); + }); + }); + + describe("isSameOrigin", function () { + let isSameOrigin; + + beforeEach(function () { + const crawler = require("./crawler"); + isSameOrigin = crawler.isSameOrigin; + }); + + it("should return true for same protocol, domain, and port", function () { + const result = isSameOrigin( + "https://example.com:443/page1", + "https://example.com:443/page2" + ); + expect(result).to.be.true; + }); + + it("should return true for same origin with default ports", function () { + const result = isSameOrigin( + "https://example.com/page1", + "https://example.com/page2" + ); + expect(result).to.be.true; + }); + + it("should return false for different protocol", function () { + const result = isSameOrigin( + "http://example.com/page1", + "https://example.com/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for different domain", function () { + const result = isSameOrigin( + "https://example.com/page1", + "https://other.com/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for different port", function () { + const result = isSameOrigin( + "https://example.com:443/page1", + "https://example.com:8080/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for subdomain differences", function () { + const result = isSameOrigin( + "https://example.com/page1", + "https://subdomain.example.com/page2" + ); + expect(result).to.be.false; + }); + + it("should return false for malformed URLs", function () { + const result = isSameOrigin("not a url", "https://example.com"); + expect(result).to.be.false; + }); + + it("should handle query parameters", function () { + const result = isSameOrigin( + "https://example.com/page?foo=bar", + "https://example.com/page?baz=qux" + ); + expect(result).to.be.true; + }); + + it("should handle fragments", function () { + const result = isSameOrigin( + "https://example.com/page#section1", + "https://example.com/page#section2" + ); + expect(result).to.be.true; + }); + }); + + describe("resolveRelativeUrl", function () { + let resolveRelativeUrl; + + beforeEach(function () { + const crawler = require("./crawler"); + resolveRelativeUrl = crawler.resolveRelativeUrl; + }); + + it("should resolve relative path against origin", function () { + const result = resolveRelativeUrl( + "/page1", + "https://example.com" + ); + expect(result).to.equal("https://example.com/page1"); + }); + + it("should resolve relative path with ../ navigation", function () { + const result = resolveRelativeUrl( + "../page1", + "https://example.com/dir/subdir/" + ); + expect(result).to.equal("https://example.com/dir/page1"); + }); + + it("should resolve absolute path starting with /", function () { + const result = resolveRelativeUrl( + "/absolute/path", + "https://example.com/some/dir" + ); + expect(result).to.equal("https://example.com/absolute/path"); + }); + + it("should return null for malformed relative URLs", function () { + // Note: URL constructor is quite forgiving, so we need a truly malformed URL + // In practice, most strings can be parsed as relative URLs + const result = resolveRelativeUrl( + "", + "not a valid base" + ); + expect(result).to.be.null; + }); + + it("should return absolute URL unchanged", function () { + const result = resolveRelativeUrl( + "https://other.com/page", + "https://example.com" + ); + expect(result).to.equal("https://other.com/page"); + }); + + it("should handle query parameters in relative URLs", function () { + const result = resolveRelativeUrl( + "/page?foo=bar", + "https://example.com" + ); + expect(result).to.equal("https://example.com/page?foo=bar"); + }); + + it("should handle fragments in relative URLs", function () { + const result = resolveRelativeUrl( + "/page#section", + "https://example.com" + ); + expect(result).to.equal("https://example.com/page#section"); + }); + }); + + describe("crawlUrls", function () { + let crawlUrls, axiosStub, logStub; + + beforeEach(function () { + axiosStub = { + get: sinon.stub(), + }; + logStub = sinon.stub(); + + const crawlerModule = proxyquire("./crawler", { + axios: axiosStub, + "./utils": { log: logStub }, + }); + crawlUrls = crawlerModule.crawlUrls; + }); + + afterEach(function () { + sinon.restore(); + }); + + it("should crawl single URL with no links", async function () { + const config = { logLevel: "info" }; + axiosStub.get.resolves({ data: "No links" }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal(["https://example.com/page1"]); + expect(axiosStub.get.calledOnce).to.be.true; + }); + + it("should crawl same-origin links", async function () { + const config = { logLevel: "info" }; + + axiosStub.get + .withArgs("https://example.com/page1") + .resolves({ + data: 'Link', + }); + + axiosStub.get + .withArgs("https://example.com/page2") + .resolves({ + data: "No more links", + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + expect(axiosStub.get.calledTwice).to.be.true; + }); + + it("should not crawl cross-origin links", async function () { + const config = { logLevel: "info" }; + + axiosStub.get.resolves({ + data: 'External', + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal(["https://example.com/page1"]); + expect(axiosStub.get.calledOnce).to.be.true; + }); + + it("should deduplicate URLs", async function () { + const config = { logLevel: "info" }; + + axiosStub.get + .withArgs("https://example.com/page1") + .resolves({ + data: 'Link', + }); + + axiosStub.get + .withArgs("https://example.com/page2") + .resolves({ + data: 'Back', + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + expect(axiosStub.get.calledTwice).to.be.true; + }); + + it("should handle fetch errors gracefully", async function () { + const config = { logLevel: "info" }; + + axiosStub.get + .withArgs("https://example.com/page1") + .resolves({ + data: 'Link', + }); + + axiosStub.get + .withArgs("https://example.com/page2") + .rejects(new Error("404 Not Found")); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + expect(logStub.calledWith(config, "warn")).to.be.true; + }); + + it("should resolve relative URLs with origin config", async function () { + const config = { logLevel: "info", origin: "https://example.com" }; + + axiosStub.get + .withArgs("https://example.com/page1") + .resolves({ + data: 'Relative Link', + }); + + axiosStub.get + .withArgs("https://example.com/page2") + .resolves({ + data: "No more links", + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + }); + + it("should skip relative URLs without origin config", async function () { + const config = { logLevel: "info" }; + + axiosStub.get.resolves({ + data: 'Relative Link', + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal(["https://example.com/page1"]); + expect(logStub.calledWith(config, "debug", sinon.match(/Skipping relative URL/))).to.be.true; + }); + + it("should extract URLs from Markdown content", async function () { + const config = { logLevel: "info" }; + + axiosStub.get + .withArgs("https://example.com/page1") + .resolves({ + data: "[Link](https://example.com/page2)", + }); + + axiosStub.get + .withArgs("https://example.com/page2") + .resolves({ + data: "No more links", + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + ]); + }); + + it("should handle non-string content", async function () { + const config = { logLevel: "info" }; + + axiosStub.get.resolves({ data: { json: "object" } }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page1"], + }); + + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + }); +}); diff --git a/src/utils.js b/src/utils.js index 8b0a61c..fb6ee29 100644 --- a/src/utils.js +++ b/src/utils.js @@ -11,6 +11,7 @@ const { transformToSchemaKey, readFile, } = require("doc-detective-common"); +const { crawlUrls } = require("./crawler"); exports.qualifyFiles = qualifyFiles; exports.parseTests = parseTests; @@ -184,6 +185,57 @@ async function qualifyFiles({ config }) { const cleanup = config.afterAll; if (cleanup) sequence = sequence.concat(cleanup); + // Determine if crawling is enabled + let shouldCrawl = false; + if (config.crawl !== undefined) { + // Explicit config setting takes precedence + shouldCrawl = config.crawl === true; + } + + // Collect URLs that should be crawled + const urlsToCrawl = []; + for (const source of sequence) { + const isHttpUrl = + typeof source === "string" && + (source.startsWith("http://") || source.startsWith("https://")); + + if (isHttpUrl) { + // Determine if this specific URL should be crawled + let crawlThisUrl = shouldCrawl; + + // If crawl config is not explicitly set, use protocol-based default + if (config.crawl === undefined) { + crawlThisUrl = true; // HTTPS/HTTP URLs crawled by default + } + + if (crawlThisUrl) { + urlsToCrawl.push(source); + } + } + } + + // Perform crawling if there are URLs to crawl + if (urlsToCrawl.length > 0) { + log(config, "info", `Crawling ${urlsToCrawl.length} URL(s)...`); + try { + const crawledUrls = await crawlUrls({ + config, + initialUrls: urlsToCrawl, + }); + + // Add newly discovered URLs to the sequence + // Filter out URLs that were already in the initial sequence + const newUrls = crawledUrls.filter((url) => !sequence.includes(url)); + log(config, "info", `Discovered ${newUrls.length} additional URL(s) via crawling`); + + // Add new URLs after the input section but before cleanup + const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length; + sequence.splice(cleanupStartIndex, 0, ...newUrls); + } catch (error) { + log(config, "error", `Crawling failed: ${error.message}`); + } + } + for (let source of sequence) { log(config, "debug", `source: ${source}`); // Check if source is a URL From c529cbf848bb48f25bcf9a47c1d648183f68369c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:23:37 +0000 Subject: [PATCH 3/9] Add XML sitemap support, URL limit test, and config validation handling Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- src/config.js | 11 ++++++ src/config.test.js | 40 ++++++++++++++++++++++ src/crawler.js | 30 +++++++++++++++- src/crawler.test.js | 83 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 163 insertions(+), 1 deletion(-) diff --git a/src/config.js b/src/config.js index 72eb211..2ca814f 100644 --- a/src/config.js +++ b/src/config.js @@ -478,6 +478,12 @@ async function setConfig({ config }) { } } + // Extract non-schema fields before validation + const nonSchemaFields = { + crawl: config.crawl, + }; + delete config.crawl; + // Validate inbound `config`. const validityCheck = validate({ schemaKey: "config_v3", object: config }); if (!validityCheck.valid) { @@ -491,6 +497,11 @@ async function setConfig({ config }) { } config = validityCheck.object; + // Restore non-schema fields after validation + if (nonSchemaFields.crawl !== undefined) { + config.crawl = nonSchemaFields.crawl; + } + // Replace fileType strings with objects config.fileTypes = config.fileTypes.map((fileType) => { if (typeof fileType === "object") return fileType; diff --git a/src/config.test.js b/src/config.test.js index 3880c5b..610df4b 100644 --- a/src/config.test.js +++ b/src/config.test.js @@ -502,3 +502,43 @@ describe("resolveConcurrentRunners", function () { expect(result.concurrentRunners).to.equal(4); }); }); + +describe("crawl config field", function () { + it("should preserve crawl field through validation", async function () { + const inputConfig = { + input: ["https://example.com"], + crawl: true, + logLevel: "info", + fileTypes: ["markdown"] + }; + + const result = await setConfig({ config: inputConfig }); + + expect(result.crawl).to.equal(true); + }); + + it("should handle crawl field set to false", async function () { + const inputConfig = { + input: ["https://example.com"], + crawl: false, + logLevel: "info", + fileTypes: ["markdown"] + }; + + const result = await setConfig({ config: inputConfig }); + + expect(result.crawl).to.equal(false); + }); + + it("should handle missing crawl field", async function () { + const inputConfig = { + input: ["https://example.com"], + logLevel: "info", + fileTypes: ["markdown"] + }; + + const result = await setConfig({ config: inputConfig }); + + expect(result.crawl).to.be.undefined; + }); +}); diff --git a/src/crawler.js b/src/crawler.js index 62c2ed5..4b442dc 100644 --- a/src/crawler.js +++ b/src/crawler.js @@ -3,6 +3,7 @@ const { log } = require("./utils"); exports.extractHtmlUrls = extractHtmlUrls; exports.extractMarkdownUrls = extractMarkdownUrls; +exports.extractXmlSitemapUrls = extractXmlSitemapUrls; exports.isSameOrigin = isSameOrigin; exports.resolveRelativeUrl = resolveRelativeUrl; exports.crawlUrls = crawlUrls; @@ -63,6 +64,32 @@ function extractMarkdownUrls(markdown) { return urls; } +/** + * Extracts URLs from XML sitemap. + * + * @param {string} xml - The XML sitemap content to parse + * @returns {string[]} - Array of extracted URLs + */ +function extractXmlSitemapUrls(xml) { + if (typeof xml !== "string") { + return []; + } + + const urls = []; + // Match tags in XML sitemaps + const locRegex = /([^<]+)<\/loc>/gi; + let match; + + while ((match = locRegex.exec(xml)) !== null) { + const url = match[1].trim(); + if (url) { + urls.push(url); + } + } + + return urls; +} + /** * Compares two URLs for strict origin matching. * @@ -154,10 +181,11 @@ async function crawlUrls({ config, initialUrls }) { // Extract URLs based on content type let extractedUrls = []; if (typeof content === "string") { - // Try both HTML and Markdown extraction + // Try HTML, Markdown, and XML sitemap extraction extractedUrls = [ ...extractHtmlUrls(content), ...extractMarkdownUrls(content), + ...extractXmlSitemapUrls(content), ]; } diff --git a/src/crawler.test.js b/src/crawler.test.js index 0b17ca5..74432d5 100644 --- a/src/crawler.test.js +++ b/src/crawler.test.js @@ -134,6 +134,63 @@ describe("crawler", function () { }); }); + describe("extractXmlSitemapUrls", function () { + let extractXmlSitemapUrls; + + beforeEach(function () { + const crawler = require("./crawler"); + extractXmlSitemapUrls = crawler.extractXmlSitemapUrls; + }); + + it("should extract single URL from XML sitemap", function () { + const xml = ` + + + https://example.com/page1 + + `; + const urls = extractXmlSitemapUrls(xml); + expect(urls).to.deep.equal(["https://example.com/page1"]); + }); + + it("should extract multiple URLs from XML sitemap", function () { + const xml = ` + + + https://example.com/page1 + + + https://example.com/page2 + + + https://example.com/page3 + + `; + const urls = extractXmlSitemapUrls(xml); + expect(urls).to.deep.equal([ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3", + ]); + }); + + it("should handle empty string", function () { + const urls = extractXmlSitemapUrls(""); + expect(urls).to.deep.equal([]); + }); + + it("should handle non-string input", function () { + const urls = extractXmlSitemapUrls(null); + expect(urls).to.deep.equal([]); + }); + + it("should handle XML without loc tags", function () { + const xml = "test"; + const urls = extractXmlSitemapUrls(xml); + expect(urls).to.deep.equal([]); + }); + }); + describe("isSameOrigin", function () { let isSameOrigin; @@ -487,5 +544,31 @@ describe("crawler", function () { expect(urls).to.deep.equal(["https://example.com/page1"]); }); + + it("should enforce 10,000 URL limit", async function () { + const config = { logLevel: "info" }; + + // Create a mock that generates many URLs + let callCount = 0; + axiosStub.get.callsFake(async (url) => { + callCount++; + if (callCount <= 10001) { + // Generate unique URLs + return { + data: `Link`, + }; + } + return { data: "No more links" }; + }); + + const urls = await crawlUrls({ + config, + initialUrls: ["https://example.com/page0"], + }); + + // Should stop at 10,000 URLs + expect(urls.length).to.equal(10000); + expect(logStub.calledWith(config, "warn", sinon.match(/maximum limit/))).to.be.true; + }); }); }); From 7a6ec961befd4e8180e926989fc0ac6177755878 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:27:25 +0000 Subject: [PATCH 4/9] Fix circular dependency by removing log import from crawler Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- src/crawler.js | 15 +++++++++------ src/crawler.test.js | 11 ++++++++++- src/utils.js | 1 + 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/crawler.js b/src/crawler.js index 4b442dc..0a5f5f7 100644 --- a/src/crawler.js +++ b/src/crawler.js @@ -1,5 +1,4 @@ const axios = require("axios"); -const { log } = require("./utils"); exports.extractHtmlUrls = extractHtmlUrls; exports.extractMarkdownUrls = extractMarkdownUrls; @@ -143,9 +142,13 @@ function resolveRelativeUrl(relativeUrl, baseOrigin) { * @param {Object} options - Crawling options * @param {Object} options.config - Configuration object * @param {string[]} options.initialUrls - Array of initial URLs to crawl + * @param {Function} options.log - Logging function (optional) * @returns {Promise} - Promise resolving to array of all discovered URLs */ -async function crawlUrls({ config, initialUrls }) { +async function crawlUrls({ config, initialUrls, log }) { + // Default no-op logger if not provided + const logger = log || (() => {}); + const visitedUrls = new Set(); const discoveredUrls = []; const MAX_URLS = 10000; @@ -163,7 +166,7 @@ async function crawlUrls({ config, initialUrls }) { visitedUrls.add(currentUrl); discoveredUrls.push(currentUrl); - log(config, "debug", `Crawling: ${currentUrl}`); + logger(config, "debug", `Crawling: ${currentUrl}`); // Fetch the URL content let content; @@ -174,7 +177,7 @@ async function crawlUrls({ config, initialUrls }) { }); content = response.data; } catch (error) { - log(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`); + logger(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`); continue; } @@ -206,7 +209,7 @@ async function crawlUrls({ config, initialUrls }) { } } else { // No origin configured, skip relative URLs - log( + logger( config, "debug", `Skipping relative URL (no origin configured): ${url}` @@ -226,7 +229,7 @@ async function crawlUrls({ config, initialUrls }) { // Log warning if limit reached if (discoveredUrls.length >= MAX_URLS) { - log( + logger( config, "warn", `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs` diff --git a/src/crawler.test.js b/src/crawler.test.js index 74432d5..6674f9c 100644 --- a/src/crawler.test.js +++ b/src/crawler.test.js @@ -347,7 +347,6 @@ describe("crawler", function () { const crawlerModule = proxyquire("./crawler", { axios: axiosStub, - "./utils": { log: logStub }, }); crawlUrls = crawlerModule.crawlUrls; }); @@ -363,6 +362,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal(["https://example.com/page1"]); @@ -387,6 +387,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal([ @@ -406,6 +407,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal(["https://example.com/page1"]); @@ -430,6 +432,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal([ @@ -455,6 +458,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal([ @@ -482,6 +486,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal([ @@ -500,6 +505,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal(["https://example.com/page1"]); @@ -524,6 +530,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal([ @@ -540,6 +547,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page1"], + log: logStub, }); expect(urls).to.deep.equal(["https://example.com/page1"]); @@ -564,6 +572,7 @@ describe("crawler", function () { const urls = await crawlUrls({ config, initialUrls: ["https://example.com/page0"], + log: logStub, }); // Should stop at 10,000 URLs diff --git a/src/utils.js b/src/utils.js index fb6ee29..36912b7 100644 --- a/src/utils.js +++ b/src/utils.js @@ -221,6 +221,7 @@ async function qualifyFiles({ config }) { const crawledUrls = await crawlUrls({ config, initialUrls: urlsToCrawl, + log, }); // Add newly discovered URLs to the sequence From e4cb209a23fe82cfd215b957e4d6a3f4b51cf203 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 30 Oct 2025 13:30:42 +0000 Subject: [PATCH 5/9] Add security filtering for unsafe URL schemes (vbscript, data) Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- src/crawler.js | 10 +++++++++- src/crawler.test.js | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/crawler.js b/src/crawler.js index 0a5f5f7..77b78cf 100644 --- a/src/crawler.js +++ b/src/crawler.js @@ -26,7 +26,15 @@ function extractHtmlUrls(html) { while ((match = anchorRegex.exec(html)) !== null) { const url = match[1]; - if (url && url !== "#" && !url.startsWith("javascript:")) { + // Filter out unsafe URL schemes + const lowerUrl = url.toLowerCase(); + if ( + url && + url !== "#" && + !lowerUrl.startsWith("javascript:") && + !lowerUrl.startsWith("vbscript:") && + !lowerUrl.startsWith("data:") + ) { urls.push(url); } } diff --git a/src/crawler.test.js b/src/crawler.test.js index 6674f9c..63e51da 100644 --- a/src/crawler.test.js +++ b/src/crawler.test.js @@ -60,6 +60,18 @@ describe("crawler", function () { expect(urls).to.deep.equal(["https://example.com"]); }); + it("should ignore vbscript: links", function () { + const html = 'VB LinkLink'; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal(["https://example.com"]); + }); + + it("should ignore data: links", function () { + const html = 'Data LinkLink'; + const urls = extractHtmlUrls(html); + expect(urls).to.deep.equal(["https://example.com"]); + }); + it("should handle empty string", function () { const urls = extractHtmlUrls(""); expect(urls).to.deep.equal([]); From 6c5378f6472c129f52478abb0893f967e2c6273f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 31 Oct 2025 00:16:51 +0000 Subject: [PATCH 6/9] Reduce scope to only support sitemap.xml URLs - Remove HTML and Markdown URL extraction - Only process URLs ending with sitemap.xml - Simplify crawler to crawlSitemap function - Update tests to reflect reduced scope - 66 tests passing (down from 96) Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- src/crawler.integration.test.js | 95 ++++--- src/crawler.js | 200 +++------------ src/crawler.test.js | 421 +++----------------------------- src/utils.js | 42 ++-- 4 files changed, 128 insertions(+), 630 deletions(-) diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js index d6f8797..7d4ab9e 100644 --- a/src/crawler.integration.test.js +++ b/src/crawler.integration.test.js @@ -8,7 +8,7 @@ before(async function () { }); describe("crawler integration", function () { - let qualifyFiles, axiosStub, fsStub, logStub, crawlUrlsStub, readFileStub; + let qualifyFiles, axiosStub, fsStub, crawlSitemapStub, readFileStub; beforeEach(function () { axiosStub = { @@ -23,18 +23,19 @@ describe("crawler integration", function () { writeFileSync: sinon.stub(), }; - logStub = sinon.stub(); - crawlUrlsStub = sinon.stub(); + crawlSitemapStub = sinon.stub(); readFileStub = sinon.stub().resolves({}); // Mock fetchFile behavior axiosStub.get.callsFake(async (url) => { - if (url === "https://example.com/page1") { + if (url.endsWith("sitemap.xml")) { return { - data: 'Link', + data: ` + + https://example.com/page1 + https://example.com/page2 + `, }; - } else if (url === "https://example.com/page2") { - return { data: "Content" }; } return { data: "" }; }); @@ -42,7 +43,7 @@ describe("crawler integration", function () { const utilsModule = proxyquire("./utils", { axios: axiosStub, fs: fsStub, - "./crawler": { crawlUrls: crawlUrlsStub }, + "./crawler": { crawlSitemap: crawlSitemapStub }, "doc-detective-common": { validate: () => ({ valid: true }), resolvePaths: (x) => x, @@ -58,14 +59,14 @@ describe("crawler integration", function () { sinon.restore(); }); - it("should enable crawling by default for HTTP URLs", async function () { + it("should process sitemap.xml URLs by default", async function () { const config = { - input: ["https://example.com/page1"], + input: ["https://example.com/sitemap.xml"], logLevel: "info", fileTypes: [], }; - crawlUrlsStub.resolves([ + crawlSitemapStub.resolves([ "https://example.com/page1", "https://example.com/page2", ]); @@ -76,16 +77,13 @@ describe("crawler integration", function () { await qualifyFiles({ config }); - expect(crawlUrlsStub.calledOnce).to.be.true; - expect(crawlUrlsStub.firstCall.args[0].initialUrls).to.deep.equal([ - "https://example.com/page1", - ]); + expect(crawlSitemapStub.calledOnce).to.be.true; + expect(crawlSitemapStub.firstCall.args[0].sitemapUrl).to.equal("https://example.com/sitemap.xml"); }); - it("should disable crawling when crawl is false", async function () { + it("should not process non-sitemap URLs", async function () { const config = { - input: ["https://example.com/page1"], - crawl: false, + input: ["https://example.com/page.html"], logLevel: "info", fileTypes: [], }; @@ -96,78 +94,69 @@ describe("crawler integration", function () { await qualifyFiles({ config }); - expect(crawlUrlsStub.called).to.be.false; + expect(crawlSitemapStub.called).to.be.false; }); - it("should enable crawling when crawl is true", async function () { + it("should disable processing when crawl is false", async function () { const config = { - input: ["https://example.com/page1"], - crawl: true, + input: ["https://example.com/sitemap.xml"], + crawl: false, logLevel: "info", fileTypes: [], }; - crawlUrlsStub.resolves([ - "https://example.com/page1", - "https://example.com/page2", - ]); - // Mock file system calls for fetched files fsStub.existsSync.returns(true); fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); await qualifyFiles({ config }); - expect(crawlUrlsStub.calledOnce).to.be.true; + expect(crawlSitemapStub.called).to.be.false; }); - it("should not crawl file:// URLs by default", async function () { + it("should enable processing when crawl is true", async function () { const config = { - input: [], // Empty input to avoid processing issues + input: ["https://example.com/sitemap.xml"], + crawl: true, logLevel: "info", fileTypes: [], }; - // file:// URLs won't trigger crawling since they don't start with http:// or https:// - // This test just verifies no crawling happens + crawlSitemapStub.resolves([ + "https://example.com/page1", + "https://example.com/page2", + ]); + + // Mock file system calls for fetched files + fsStub.existsSync.returns(true); + fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); await qualifyFiles({ config }); - expect(crawlUrlsStub.called).to.be.false; + expect(crawlSitemapStub.calledOnce).to.be.true; }); - it("should pass origin config to crawler", async function () { + it("should not process file:// URLs", async function () { const config = { - input: ["https://example.com/page1"], - origin: "https://example.com", - crawl: true, + input: [], logLevel: "info", fileTypes: [], }; - crawlUrlsStub.resolves(["https://example.com/page1"]); - - // Mock file system calls for fetched files - fsStub.existsSync.returns(true); - fsStub.statSync.returns({ isFile: () => true, isDirectory: () => false }); - await qualifyFiles({ config }); - expect(crawlUrlsStub.calledOnce).to.be.true; - expect(crawlUrlsStub.firstCall.args[0].config.origin).to.equal( - "https://example.com" - ); + expect(crawlSitemapStub.called).to.be.false; }); - it("should log crawling activity", async function () { + it("should log sitemap processing activity", async function () { const config = { - input: ["https://example.com/page1"], + input: ["https://example.com/sitemap.xml"], crawl: true, logLevel: "info", fileTypes: [], }; - crawlUrlsStub.resolves([ + crawlSitemapStub.resolves([ "https://example.com/page1", "https://example.com/page2", ]); @@ -187,11 +176,11 @@ describe("crawler integration", function () { try { await qualifyFiles({ config }); - // Check that crawling info was logged - const hasCrawlingLog = logOutput.some((msg) => msg.includes("Crawling")); + // Check that processing info was logged + const hasProcessingLog = logOutput.some((msg) => msg.includes("Processing") && msg.includes("sitemap")); const hasDiscoveredLog = logOutput.some((msg) => msg.includes("Discovered")); - expect(hasCrawlingLog).to.be.true; + expect(hasProcessingLog).to.be.true; expect(hasDiscoveredLog).to.be.true; } finally { console.log = originalConsoleLog; diff --git a/src/crawler.js b/src/crawler.js index 77b78cf..4549b77 100644 --- a/src/crawler.js +++ b/src/crawler.js @@ -1,75 +1,8 @@ const axios = require("axios"); -exports.extractHtmlUrls = extractHtmlUrls; -exports.extractMarkdownUrls = extractMarkdownUrls; exports.extractXmlSitemapUrls = extractXmlSitemapUrls; exports.isSameOrigin = isSameOrigin; -exports.resolveRelativeUrl = resolveRelativeUrl; -exports.crawlUrls = crawlUrls; - -/** - * Extracts URLs from HTML tags with href attributes. - * - * @param {string} html - The HTML content to parse - * @returns {string[]} - Array of extracted URLs - */ -function extractHtmlUrls(html) { - if (typeof html !== "string") { - return []; - } - - const urls = []; - // Match tags with href attributes - // This regex handles various formats: href="url", href='url', href=url - const anchorRegex = /]*?\s+)?href=["']?([^"'\s>]+)["']?[^>]*>/gi; - let match; - - while ((match = anchorRegex.exec(html)) !== null) { - const url = match[1]; - // Filter out unsafe URL schemes - const lowerUrl = url.toLowerCase(); - if ( - url && - url !== "#" && - !lowerUrl.startsWith("javascript:") && - !lowerUrl.startsWith("vbscript:") && - !lowerUrl.startsWith("data:") - ) { - urls.push(url); - } - } - - return urls; -} - -/** - * Extracts URLs from Markdown [text](url) syntax. - * - * @param {string} markdown - The Markdown content to parse - * @returns {string[]} - Array of extracted URLs - */ -function extractMarkdownUrls(markdown) { - if (typeof markdown !== "string") { - return []; - } - - const urls = []; - // Match [text](url) syntax, handling escaped brackets - // This regex avoids matching image syntax ![text](url) - const linkRegex = /(?} - Promise resolving to array of all discovered URLs */ -async function crawlUrls({ config, initialUrls, log }) { +async function crawlSitemap({ config, sitemapUrl, log }) { // Default no-op logger if not provided const logger = log || (() => {}); - const visitedUrls = new Set(); const discoveredUrls = []; - const MAX_URLS = 10000; - let urlQueue = [...initialUrls]; - // Process each URL in the queue - while (urlQueue.length > 0 && discoveredUrls.length < MAX_URLS) { - const currentUrl = urlQueue.shift(); - - // Skip if already visited - if (visitedUrls.has(currentUrl)) { - continue; - } - - visitedUrls.add(currentUrl); - discoveredUrls.push(currentUrl); - - logger(config, "debug", `Crawling: ${currentUrl}`); - - // Fetch the URL content - let content; - try { - const response = await axios.get(currentUrl, { - timeout: 30000, - maxRedirects: 5, - }); - content = response.data; - } catch (error) { - logger(config, "warn", `Failed to fetch ${currentUrl}: ${error.message}`); - continue; - } - - // Extract URLs based on content type - let extractedUrls = []; - if (typeof content === "string") { - // Try HTML, Markdown, and XML sitemap extraction - extractedUrls = [ - ...extractHtmlUrls(content), - ...extractMarkdownUrls(content), - ...extractXmlSitemapUrls(content), - ]; - } + logger(config, "debug", `Processing sitemap: ${sitemapUrl}`); + + // Fetch the sitemap content + let content; + try { + const response = await axios.get(sitemapUrl, { + timeout: 30000, + maxRedirects: 5, + }); + content = response.data; + } catch (error) { + logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`); + return discoveredUrls; + } + + // Extract URLs from sitemap + if (typeof content === "string") { + const extractedUrls = extractXmlSitemapUrls(content); - // Process extracted URLs + // Filter URLs to only include same-origin URLs for (const url of extractedUrls) { - let absoluteUrl; - - // Check if URL is relative - try { - new URL(url); - absoluteUrl = url; - } catch { - // It's relative - if (config.origin) { - absoluteUrl = resolveRelativeUrl(url, config.origin); - if (!absoluteUrl) { - continue; // Skip malformed URLs - } - } else { - // No origin configured, skip relative URLs - logger( - config, - "debug", - `Skipping relative URL (no origin configured): ${url}` - ); - continue; - } - } - - // Check if same origin as current URL - if (isSameOrigin(absoluteUrl, currentUrl)) { - if (!visitedUrls.has(absoluteUrl)) { - urlQueue.push(absoluteUrl); - } + if (isSameOrigin(url, sitemapUrl)) { + discoveredUrls.push(url); + } else { + logger(config, "debug", `Skipping cross-origin URL: ${url}`); } } } - // Log warning if limit reached - if (discoveredUrls.length >= MAX_URLS) { - logger( - config, - "warn", - `Crawling stopped: reached maximum limit of ${MAX_URLS} URLs` - ); - } + logger(config, "info", `Discovered ${discoveredUrls.length} URL(s) from sitemap`); return discoveredUrls; } diff --git a/src/crawler.test.js b/src/crawler.test.js index 63e51da..54a07e8 100644 --- a/src/crawler.test.js +++ b/src/crawler.test.js @@ -8,144 +8,6 @@ before(async function () { }); describe("crawler", function () { - describe("extractHtmlUrls", function () { - let extractHtmlUrls; - - beforeEach(function () { - const crawler = require("./crawler"); - extractHtmlUrls = crawler.extractHtmlUrls; - }); - - it("should extract single URL from HTML", function () { - const html = 'Link'; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal(["https://example.com/page1"]); - }); - - it("should extract multiple URLs from HTML", function () { - const html = ` - Link 1 - Link 2 - Link 3 - `; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - "https://example.com/page3", - ]); - }); - - it("should handle single and double quotes", function () { - const html = ` - Link 1 - Link 2 - `; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - ]); - }); - - it("should ignore anchor links", function () { - const html = 'AnchorLink'; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal(["https://example.com"]); - }); - - it("should ignore javascript: links", function () { - const html = 'JS LinkLink'; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal(["https://example.com"]); - }); - - it("should ignore vbscript: links", function () { - const html = 'VB LinkLink'; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal(["https://example.com"]); - }); - - it("should ignore data: links", function () { - const html = 'Data LinkLink'; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal(["https://example.com"]); - }); - - it("should handle empty string", function () { - const urls = extractHtmlUrls(""); - expect(urls).to.deep.equal([]); - }); - - it("should handle non-string input", function () { - const urls = extractHtmlUrls(null); - expect(urls).to.deep.equal([]); - }); - - it("should extract relative URLs", function () { - const html = 'RelativeAbsolute'; - const urls = extractHtmlUrls(html); - expect(urls).to.deep.equal(["/page1", "https://example.com"]); - }); - }); - - describe("extractMarkdownUrls", function () { - let extractMarkdownUrls; - - beforeEach(function () { - const crawler = require("./crawler"); - extractMarkdownUrls = crawler.extractMarkdownUrls; - }); - - it("should extract single URL from Markdown", function () { - const markdown = "[Link](https://example.com/page1)"; - const urls = extractMarkdownUrls(markdown); - expect(urls).to.deep.equal(["https://example.com/page1"]); - }); - - it("should extract multiple URLs from Markdown", function () { - const markdown = ` - [Link 1](https://example.com/page1) - [Link 2](https://example.com/page2) - [Link 3](https://example.com/page3) - `; - const urls = extractMarkdownUrls(markdown); - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - "https://example.com/page3", - ]); - }); - - it("should ignore image syntax", function () { - const markdown = "![Image](https://example.com/image.png) [Link](https://example.com/page1)"; - const urls = extractMarkdownUrls(markdown); - expect(urls).to.deep.equal(["https://example.com/page1"]); - }); - - it("should handle URLs with title text", function () { - const markdown = '[Link](https://example.com/page1 "Title text")'; - const urls = extractMarkdownUrls(markdown); - expect(urls).to.deep.equal(["https://example.com/page1"]); - }); - - it("should handle empty string", function () { - const urls = extractMarkdownUrls(""); - expect(urls).to.deep.equal([]); - }); - - it("should handle non-string input", function () { - const urls = extractMarkdownUrls(null); - expect(urls).to.deep.equal([]); - }); - - it("should extract relative URLs", function () { - const markdown = "[Relative](/page1) [Absolute](https://example.com)"; - const urls = extractMarkdownUrls(markdown); - expect(urls).to.deep.equal(["/page1", "https://example.com"]); - }); - }); - describe("extractXmlSitemapUrls", function () { let extractXmlSitemapUrls; @@ -281,75 +143,8 @@ describe("crawler", function () { }); }); - describe("resolveRelativeUrl", function () { - let resolveRelativeUrl; - - beforeEach(function () { - const crawler = require("./crawler"); - resolveRelativeUrl = crawler.resolveRelativeUrl; - }); - - it("should resolve relative path against origin", function () { - const result = resolveRelativeUrl( - "/page1", - "https://example.com" - ); - expect(result).to.equal("https://example.com/page1"); - }); - - it("should resolve relative path with ../ navigation", function () { - const result = resolveRelativeUrl( - "../page1", - "https://example.com/dir/subdir/" - ); - expect(result).to.equal("https://example.com/dir/page1"); - }); - - it("should resolve absolute path starting with /", function () { - const result = resolveRelativeUrl( - "/absolute/path", - "https://example.com/some/dir" - ); - expect(result).to.equal("https://example.com/absolute/path"); - }); - - it("should return null for malformed relative URLs", function () { - // Note: URL constructor is quite forgiving, so we need a truly malformed URL - // In practice, most strings can be parsed as relative URLs - const result = resolveRelativeUrl( - "", - "not a valid base" - ); - expect(result).to.be.null; - }); - - it("should return absolute URL unchanged", function () { - const result = resolveRelativeUrl( - "https://other.com/page", - "https://example.com" - ); - expect(result).to.equal("https://other.com/page"); - }); - - it("should handle query parameters in relative URLs", function () { - const result = resolveRelativeUrl( - "/page?foo=bar", - "https://example.com" - ); - expect(result).to.equal("https://example.com/page?foo=bar"); - }); - - it("should handle fragments in relative URLs", function () { - const result = resolveRelativeUrl( - "/page#section", - "https://example.com" - ); - expect(result).to.equal("https://example.com/page#section"); - }); - }); - - describe("crawlUrls", function () { - let crawlUrls, axiosStub, logStub; + describe("crawlSitemap", function () { + let crawlSitemap, axiosStub, logStub; beforeEach(function () { axiosStub = { @@ -360,45 +155,27 @@ describe("crawler", function () { const crawlerModule = proxyquire("./crawler", { axios: axiosStub, }); - crawlUrls = crawlerModule.crawlUrls; + crawlSitemap = crawlerModule.crawlSitemap; }); afterEach(function () { sinon.restore(); }); - it("should crawl single URL with no links", async function () { - const config = { logLevel: "info" }; - axiosStub.get.resolves({ data: "No links" }); - - const urls = await crawlUrls({ - config, - initialUrls: ["https://example.com/page1"], - log: logStub, - }); - - expect(urls).to.deep.equal(["https://example.com/page1"]); - expect(axiosStub.get.calledOnce).to.be.true; - }); - - it("should crawl same-origin links", async function () { + it("should process sitemap and extract same-origin URLs", async function () { const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; + const sitemapContent = ` + + https://example.com/page1 + https://example.com/page2 + `; - axiosStub.get - .withArgs("https://example.com/page1") - .resolves({ - data: 'Link', - }); - - axiosStub.get - .withArgs("https://example.com/page2") - .resolves({ - data: "No more links", - }); + axiosStub.get.resolves({ data: sitemapContent }); - const urls = await crawlUrls({ + const urls = await crawlSitemap({ config, - initialUrls: ["https://example.com/page1"], + sitemapUrl, log: logStub, }); @@ -406,190 +183,58 @@ describe("crawler", function () { "https://example.com/page1", "https://example.com/page2", ]); - expect(axiosStub.get.calledTwice).to.be.true; - }); - - it("should not crawl cross-origin links", async function () { - const config = { logLevel: "info" }; - - axiosStub.get.resolves({ - data: 'External', - }); - - const urls = await crawlUrls({ - config, - initialUrls: ["https://example.com/page1"], - log: logStub, - }); - - expect(urls).to.deep.equal(["https://example.com/page1"]); expect(axiosStub.get.calledOnce).to.be.true; }); - it("should deduplicate URLs", async function () { + it("should filter out cross-origin URLs", async function () { const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; + const sitemapContent = ` + + https://example.com/page1 + https://other.com/page2 + `; - axiosStub.get - .withArgs("https://example.com/page1") - .resolves({ - data: 'Link', - }); - - axiosStub.get - .withArgs("https://example.com/page2") - .resolves({ - data: 'Back', - }); + axiosStub.get.resolves({ data: sitemapContent }); - const urls = await crawlUrls({ + const urls = await crawlSitemap({ config, - initialUrls: ["https://example.com/page1"], + sitemapUrl, log: logStub, }); - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - ]); - expect(axiosStub.get.calledTwice).to.be.true; + expect(urls).to.deep.equal(["https://example.com/page1"]); }); it("should handle fetch errors gracefully", async function () { const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; - axiosStub.get - .withArgs("https://example.com/page1") - .resolves({ - data: 'Link', - }); - - axiosStub.get - .withArgs("https://example.com/page2") - .rejects(new Error("404 Not Found")); + axiosStub.get.rejects(new Error("404 Not Found")); - const urls = await crawlUrls({ + const urls = await crawlSitemap({ config, - initialUrls: ["https://example.com/page1"], + sitemapUrl, log: logStub, }); - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - ]); + expect(urls).to.deep.equal([]); expect(logStub.calledWith(config, "warn")).to.be.true; }); - it("should resolve relative URLs with origin config", async function () { - const config = { logLevel: "info", origin: "https://example.com" }; - - axiosStub.get - .withArgs("https://example.com/page1") - .resolves({ - data: 'Relative Link', - }); - - axiosStub.get - .withArgs("https://example.com/page2") - .resolves({ - data: "No more links", - }); - - const urls = await crawlUrls({ - config, - initialUrls: ["https://example.com/page1"], - log: logStub, - }); - - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - ]); - }); - - it("should skip relative URLs without origin config", async function () { - const config = { logLevel: "info" }; - - axiosStub.get.resolves({ - data: 'Relative Link', - }); - - const urls = await crawlUrls({ - config, - initialUrls: ["https://example.com/page1"], - log: logStub, - }); - - expect(urls).to.deep.equal(["https://example.com/page1"]); - expect(logStub.calledWith(config, "debug", sinon.match(/Skipping relative URL/))).to.be.true; - }); - - it("should extract URLs from Markdown content", async function () { - const config = { logLevel: "info" }; - - axiosStub.get - .withArgs("https://example.com/page1") - .resolves({ - data: "[Link](https://example.com/page2)", - }); - - axiosStub.get - .withArgs("https://example.com/page2") - .resolves({ - data: "No more links", - }); - - const urls = await crawlUrls({ - config, - initialUrls: ["https://example.com/page1"], - log: logStub, - }); - - expect(urls).to.deep.equal([ - "https://example.com/page1", - "https://example.com/page2", - ]); - }); - it("should handle non-string content", async function () { const config = { logLevel: "info" }; + const sitemapUrl = "https://example.com/sitemap.xml"; axiosStub.get.resolves({ data: { json: "object" } }); - const urls = await crawlUrls({ + const urls = await crawlSitemap({ config, - initialUrls: ["https://example.com/page1"], + sitemapUrl, log: logStub, }); - expect(urls).to.deep.equal(["https://example.com/page1"]); - }); - - it("should enforce 10,000 URL limit", async function () { - const config = { logLevel: "info" }; - - // Create a mock that generates many URLs - let callCount = 0; - axiosStub.get.callsFake(async (url) => { - callCount++; - if (callCount <= 10001) { - // Generate unique URLs - return { - data: `Link`, - }; - } - return { data: "No more links" }; - }); - - const urls = await crawlUrls({ - config, - initialUrls: ["https://example.com/page0"], - log: logStub, - }); - - // Should stop at 10,000 URLs - expect(urls.length).to.equal(10000); - expect(logStub.calledWith(config, "warn", sinon.match(/maximum limit/))).to.be.true; + expect(urls).to.deep.equal([]); }); }); }); diff --git a/src/utils.js b/src/utils.js index 36912b7..ab4fa47 100644 --- a/src/utils.js +++ b/src/utils.js @@ -11,7 +11,7 @@ const { transformToSchemaKey, readFile, } = require("doc-detective-common"); -const { crawlUrls } = require("./crawler"); +const { crawlSitemap } = require("./crawler"); exports.qualifyFiles = qualifyFiles; exports.parseTests = parseTests; @@ -192,48 +192,56 @@ async function qualifyFiles({ config }) { shouldCrawl = config.crawl === true; } - // Collect URLs that should be crawled - const urlsToCrawl = []; + // Collect sitemap.xml URLs that should be crawled + const sitemapsToProcess = []; for (const source of sequence) { const isHttpUrl = typeof source === "string" && (source.startsWith("http://") || source.startsWith("https://")); + + const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml"); - if (isHttpUrl) { + if (isHttpUrl && isSitemapUrl) { // Determine if this specific URL should be crawled let crawlThisUrl = shouldCrawl; // If crawl config is not explicitly set, use protocol-based default if (config.crawl === undefined) { - crawlThisUrl = true; // HTTPS/HTTP URLs crawled by default + crawlThisUrl = true; // HTTPS/HTTP sitemap.xml URLs crawled by default } if (crawlThisUrl) { - urlsToCrawl.push(source); + sitemapsToProcess.push(source); } } } - // Perform crawling if there are URLs to crawl - if (urlsToCrawl.length > 0) { - log(config, "info", `Crawling ${urlsToCrawl.length} URL(s)...`); + // Process sitemaps if there are any to crawl + if (sitemapsToProcess.length > 0) { + log(config, "info", `Processing ${sitemapsToProcess.length} sitemap(s)...`); try { - const crawledUrls = await crawlUrls({ - config, - initialUrls: urlsToCrawl, - log, - }); + const allDiscoveredUrls = []; + + // Process each sitemap + for (const sitemapUrl of sitemapsToProcess) { + const discoveredUrls = await crawlSitemap({ + config, + sitemapUrl, + log, + }); + allDiscoveredUrls.push(...discoveredUrls); + } // Add newly discovered URLs to the sequence // Filter out URLs that were already in the initial sequence - const newUrls = crawledUrls.filter((url) => !sequence.includes(url)); - log(config, "info", `Discovered ${newUrls.length} additional URL(s) via crawling`); + const newUrls = allDiscoveredUrls.filter((url) => !sequence.includes(url)); + log(config, "info", `Discovered ${newUrls.length} additional URL(s) from sitemap(s)`); // Add new URLs after the input section but before cleanup const cleanupStartIndex = cleanup ? sequence.indexOf(cleanup[0]) : sequence.length; sequence.splice(cleanupStartIndex, 0, ...newUrls); } catch (error) { - log(config, "error", `Crawling failed: ${error.message}`); + log(config, "error", `Sitemap processing failed: ${error.message}`); } } From 3f247a1dcb1676cdfe4005d017a903af03de77b3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 31 Oct 2025 00:38:07 +0000 Subject: [PATCH 7/9] Pin to doc-detective-common@3.4.0-dev.3 and remove crawl field workaround - Update package.json to use doc-detective-common@3.4.0-dev.3 - Remove code that strips/restores crawl field during validation (now in schema) - Simplify utils.js logic since config.crawl always defined (defaults to false) - Update tests to reflect new default behavior (crawl defaults to false) - All 66 tests passing Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- package-lock.json | 8 ++++---- package.json | 2 +- src/config.js | 11 ----------- src/config.test.js | 4 ++-- src/crawler.integration.test.js | 3 ++- src/utils.js | 18 ++---------------- 6 files changed, 11 insertions(+), 35 deletions(-) diff --git a/package-lock.json b/package-lock.json index 357fc7b..ee9c42c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,7 +12,7 @@ "@apidevtools/json-schema-ref-parser": "^14.2.1", "ajv": "^8.17.1", "axios": "^1.13.1", - "doc-detective-common": "3.4.0-dev.2", + "doc-detective-common": "3.4.0-dev.3", "dotenv": "^17.2.3", "json-schema-faker": "^0.5.9", "posthog-node": "^5.10.4" @@ -700,9 +700,9 @@ } }, "node_modules/doc-detective-common": { - "version": "3.4.0-dev.2", - "resolved": "https://registry.npmjs.org/doc-detective-common/-/doc-detective-common-3.4.0-dev.2.tgz", - "integrity": "sha512-COI9K9jBTwMJmlDLVUksF2WTCKnNlIJhr7pgUUR5VAX6Wt0t6DLfAkbHOUROVFWeBAPW1yVwuLRTQkmNxoR9Dw==", + "version": "3.4.0-dev.3", + "resolved": "https://registry.npmjs.org/doc-detective-common/-/doc-detective-common-3.4.0-dev.3.tgz", + "integrity": "sha512-ZZohuQ7qTxwjXkaehyEJA/5hM5oKQ2J1B6Ue6zTbRl0MJcKtH+iF9X5Vl9aqyEplwPmUabxtzO6m0H4PZKko0g==", "license": "AGPL-3.0-only", "dependencies": { "@apidevtools/json-schema-ref-parser": "^14.2.1", diff --git a/package.json b/package.json index 0cf5f87..c5ccb0b 100644 --- a/package.json +++ b/package.json @@ -27,7 +27,7 @@ "@apidevtools/json-schema-ref-parser": "^14.2.1", "ajv": "^8.17.1", "axios": "^1.13.1", - "doc-detective-common": "3.4.0-dev.2", + "doc-detective-common": "3.4.0-dev.3", "dotenv": "^17.2.3", "json-schema-faker": "^0.5.9", "posthog-node": "^5.10.4" diff --git a/src/config.js b/src/config.js index 2ca814f..72eb211 100644 --- a/src/config.js +++ b/src/config.js @@ -478,12 +478,6 @@ async function setConfig({ config }) { } } - // Extract non-schema fields before validation - const nonSchemaFields = { - crawl: config.crawl, - }; - delete config.crawl; - // Validate inbound `config`. const validityCheck = validate({ schemaKey: "config_v3", object: config }); if (!validityCheck.valid) { @@ -497,11 +491,6 @@ async function setConfig({ config }) { } config = validityCheck.object; - // Restore non-schema fields after validation - if (nonSchemaFields.crawl !== undefined) { - config.crawl = nonSchemaFields.crawl; - } - // Replace fileType strings with objects config.fileTypes = config.fileTypes.map((fileType) => { if (typeof fileType === "object") return fileType; diff --git a/src/config.test.js b/src/config.test.js index 610df4b..dc8e283 100644 --- a/src/config.test.js +++ b/src/config.test.js @@ -530,7 +530,7 @@ describe("crawl config field", function () { expect(result.crawl).to.equal(false); }); - it("should handle missing crawl field", async function () { + it("should default crawl field to false when not specified", async function () { const inputConfig = { input: ["https://example.com"], logLevel: "info", @@ -539,6 +539,6 @@ describe("crawl config field", function () { const result = await setConfig({ config: inputConfig }); - expect(result.crawl).to.be.undefined; + expect(result.crawl).to.equal(false); }); }); diff --git a/src/crawler.integration.test.js b/src/crawler.integration.test.js index 7d4ab9e..ad885c3 100644 --- a/src/crawler.integration.test.js +++ b/src/crawler.integration.test.js @@ -59,9 +59,10 @@ describe("crawler integration", function () { sinon.restore(); }); - it("should process sitemap.xml URLs by default", async function () { + it("should process sitemap.xml URLs when crawl is true", async function () { const config = { input: ["https://example.com/sitemap.xml"], + crawl: true, logLevel: "info", fileTypes: [], }; diff --git a/src/utils.js b/src/utils.js index ab4fa47..9253cdf 100644 --- a/src/utils.js +++ b/src/utils.js @@ -185,13 +185,6 @@ async function qualifyFiles({ config }) { const cleanup = config.afterAll; if (cleanup) sequence = sequence.concat(cleanup); - // Determine if crawling is enabled - let shouldCrawl = false; - if (config.crawl !== undefined) { - // Explicit config setting takes precedence - shouldCrawl = config.crawl === true; - } - // Collect sitemap.xml URLs that should be crawled const sitemapsToProcess = []; for (const source of sequence) { @@ -202,15 +195,8 @@ async function qualifyFiles({ config }) { const isSitemapUrl = typeof source === "string" && source.endsWith("sitemap.xml"); if (isHttpUrl && isSitemapUrl) { - // Determine if this specific URL should be crawled - let crawlThisUrl = shouldCrawl; - - // If crawl config is not explicitly set, use protocol-based default - if (config.crawl === undefined) { - crawlThisUrl = true; // HTTPS/HTTP sitemap.xml URLs crawled by default - } - - if (crawlThisUrl) { + // Check if crawling is enabled (defaults to false in config) + if (config.crawl === true) { sitemapsToProcess.push(source); } } From b11bd5085ab0667ea91083a69e510041306738f3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 4 Nov 2025 13:47:59 +0000 Subject: [PATCH 8/9] Fix origin matching to use final URL after redirects - Update crawlSitemap to track final URL after axios redirects - Use responseUrl from axios response for origin comparison - Fixes issue where www.doc-detective.com redirects to doc-detective.com - Now correctly discovers 125 URLs from doc-detective.com sitemap - All 66 tests passing Co-authored-by: hawkeyexl <5209367+hawkeyexl@users.noreply.github.com> --- src/crawler.js | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/crawler.js b/src/crawler.js index 4549b77..b97fd95 100644 --- a/src/crawler.js +++ b/src/crawler.js @@ -73,12 +73,19 @@ async function crawlSitemap({ config, sitemapUrl, log }) { // Fetch the sitemap content let content; + let finalUrl = sitemapUrl; try { const response = await axios.get(sitemapUrl, { timeout: 30000, maxRedirects: 5, }); content = response.data; + + // Use the final URL after redirects for origin comparison + if (response.request && response.request.res && response.request.res.responseUrl) { + finalUrl = response.request.res.responseUrl; + logger(config, "debug", `Sitemap redirected to: ${finalUrl}`); + } } catch (error) { logger(config, "warn", `Failed to fetch sitemap ${sitemapUrl}: ${error.message}`); return discoveredUrls; @@ -88,9 +95,9 @@ async function crawlSitemap({ config, sitemapUrl, log }) { if (typeof content === "string") { const extractedUrls = extractXmlSitemapUrls(content); - // Filter URLs to only include same-origin URLs + // Filter URLs to only include same-origin URLs (using final URL after redirects) for (const url of extractedUrls) { - if (isSameOrigin(url, sitemapUrl)) { + if (isSameOrigin(url, finalUrl)) { discoveredUrls.push(url); } else { logger(config, "debug", `Skipping cross-origin URL: ${url}`); From 50dac4eb9e8eda86c40af2817affe271e94ae23f Mon Sep 17 00:00:00 2001 From: hawkeyexl Date: Tue, 4 Nov 2025 13:34:31 -0500 Subject: [PATCH 9/9] Save as HTML --- dev/index.js | 3 ++- src/utils.js | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/dev/index.js b/dev/index.js index 847624b..03afb33 100644 --- a/dev/index.js +++ b/dev/index.js @@ -12,7 +12,8 @@ main(); */ async function main() { const json = { - input: "dev/doc-content.dita", + input: "https://www.doc-detective.com/sitemap.xml", + crawl: true, logLevel: "debug", runOn: [ { diff --git a/src/utils.js b/src/utils.js index 9253cdf..39bcd78 100644 --- a/src/utils.js +++ b/src/utils.js @@ -154,7 +154,11 @@ async function fetchFile(fileURL) { } else { response.data = response.data.toString(); } - const fileName = fileURL.split("/").pop(); + let fileName = fileURL.split("/").pop(); + // If fileName doesn't have an extension, add ".html" + if (!path.extname(fileName)) { + fileName += ".html"; + } const hash = crypto.createHash("md5").update(response.data).digest("hex"); const filePath = `${os.tmpdir}/doc-detective/${hash}_${fileName}`; // If doc-detective temp directory doesn't exist, create it